Bug fixes in stats and added gene dict

#12
by davidjwen - opened
This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +2 -3
  2. .pre-commit-config.yaml +0 -26
  3. .readthedocs.yaml +0 -19
  4. Geneformer-V1-10M/config.json +0 -23
  5. Geneformer-V1-10M/model.safetensors +0 -3
  6. Geneformer-V1-10M/training_args.bin +0 -3
  7. Geneformer-V2-104M/config.json +0 -24
  8. Geneformer-V2-104M/generation_config.json +0 -5
  9. Geneformer-V2-104M/model.safetensors +0 -3
  10. Geneformer-V2-104M/training_args.bin +0 -3
  11. Geneformer-V2-104M_CLcancer/config.json +0 -25
  12. Geneformer-V2-104M_CLcancer/generation_config.json +0 -5
  13. Geneformer-V2-104M_CLcancer/model.safetensors +0 -3
  14. Geneformer-V2-104M_CLcancer/training_args.bin +0 -3
  15. Geneformer-V2-316M/config.json +0 -24
  16. Geneformer-V2-316M/generation_config.json +0 -5
  17. Geneformer-V2-316M/model.safetensors +0 -3
  18. Geneformer-V2-316M/training_args.bin +0 -3
  19. MANIFEST.in +2 -9
  20. README.md +8 -69
  21. benchmarking/castle_cell_type_annotation.r +80 -0
  22. benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb +288 -0
  23. benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb +0 -0
  24. benchmarking/scDeepsort_train_predict.ipynb +166 -0
  25. config.json +10 -11
  26. docs/Makefile +0 -20
  27. docs/make.bat +0 -35
  28. docs/requirements.txt +0 -3
  29. docs/source/_static/css/custom.css +0 -40
  30. docs/source/_static/gf_logo.png +0 -0
  31. docs/source/about.rst +0 -49
  32. docs/source/api.rst +0 -51
  33. docs/source/conf.py +0 -80
  34. docs/source/geneformer.classifier.rst +0 -10
  35. docs/source/geneformer.emb_extractor.rst +0 -26
  36. docs/source/geneformer.in_silico_perturber.rst +0 -8
  37. docs/source/geneformer.in_silico_perturber_stats.rst +0 -25
  38. docs/source/geneformer.mtl_classifier.rst +0 -11
  39. docs/source/geneformer.tokenizer.rst +0 -15
  40. docs/source/getstarted.rst +0 -36
  41. docs/source/index.rst +0 -16
  42. examples/cell_classification.ipynb +0 -0
  43. examples/distributed_multitask_cell_classification.ipynb +0 -149
  44. examples/example_input_files/bivalent_promoters/bivalent_gene_labels.txt +107 -0
  45. examples/example_input_files/bivalent_promoters/lys4_only_gene_labels.txt +80 -0
  46. examples/example_input_files/bivalent_promoters/no_methylation_gene_labels.txt +42 -0
  47. examples/example_input_files/dosage_sensitive_tfs/dosage_sens_tf_labels.csv +369 -0
  48. examples/example_input_files/gene_info_table.csv +0 -0
  49. examples/extract_and_plot_cell_embeddings.ipynb +0 -0
  50. examples/gene_classification.ipynb +0 -0
.gitattributes CHANGED
@@ -14,11 +14,10 @@
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
17
- *.pkl filter=lfs diff=lfs merge=lfs -text
18
  *.pt filter=lfs diff=lfs merge=lfs -text
19
  *.pth filter=lfs diff=lfs merge=lfs -text
20
  *.rar filter=lfs diff=lfs merge=lfs -text
21
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
22
  *.tar.* filter=lfs diff=lfs merge=lfs -text
23
  *.tflite filter=lfs diff=lfs merge=lfs -text
24
  *.tgz filter=lfs diff=lfs merge=lfs -text
@@ -26,4 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
  *.zip filter=lfs diff=lfs merge=lfs -text
27
  *.zstandard filter=lfs diff=lfs merge=lfs -text
28
  *tfevents* filter=lfs diff=lfs merge=lfs -text
29
- model.safetensors filter=lfs diff=lfs merge=lfs -text
 
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ geneformer/gene_name_id_dict.pkl filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml DELETED
@@ -1,26 +0,0 @@
1
- # See https://pre-commit.com for more information
2
- # See https://pre-commit.com/hooks.html for more hooks
3
- repos:
4
- - repo: https://github.com/pre-commit/pre-commit-hooks
5
- rev: v3.2.0
6
- hooks:
7
- - id: trailing-whitespace
8
- - id: end-of-file-fixer
9
- - id: check-yaml
10
- - id: check-added-large-files
11
- - id: check-merge-conflict
12
- - id: mixed-line-ending
13
- - id: check-docstring-first
14
- - repo: https://github.com/pycqa/isort
15
- rev: 5.12.0
16
- hooks:
17
- - id: isort
18
- args: ["--profile", "black"]
19
- - repo: https://github.com/astral-sh/ruff-pre-commit
20
- # Ruff version.
21
- rev: v0.1.4
22
- hooks:
23
- # Run the Ruff linter.
24
- - id: ruff
25
- # Run the Ruff formatter.
26
- - id: ruff-format
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.readthedocs.yaml DELETED
@@ -1,19 +0,0 @@
1
- # Read the Docs configuration file
2
-
3
- # Required
4
- version: 2
5
-
6
- # Set the OS, Python version and other tools you might need
7
- build:
8
- os: ubuntu-22.04
9
- tools:
10
- python: "3.10"
11
-
12
- # Build documentation in the "docs/" directory with Sphinx
13
- sphinx:
14
- configuration: docs/source/conf.py
15
-
16
- # Python requirements required to build your documentation
17
- python:
18
- install:
19
- - requirements: docs/requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Geneformer-V1-10M/config.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "architectures": [
3
- "BertForMaskedLM"
4
- ],
5
- "attention_probs_dropout_prob": 0.02,
6
- "gradient_checkpointing": false,
7
- "hidden_act": "relu",
8
- "hidden_dropout_prob": 0.02,
9
- "hidden_size": 256,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 512,
12
- "layer_norm_eps": 1e-12,
13
- "max_position_embeddings": 2048,
14
- "model_type": "bert",
15
- "num_attention_heads": 4,
16
- "num_hidden_layers": 6,
17
- "pad_token_id": 0,
18
- "position_embedding_type": "absolute",
19
- "transformers_version": "4.6.0",
20
- "type_vocab_size": 2,
21
- "use_cache": true,
22
- "vocab_size": 25426
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Geneformer-V1-10M/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5e33a757431643b3697de7ef6127950cdc49e06e58d4266b3a3ab191b683f14
3
- size 41183536
 
 
 
 
Geneformer-V1-10M/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0ec3459454205174c9d2e4d6c6930f6b0fbf3364fc03a6f4d99c4d3add2012b
3
- size 2607
 
 
 
 
Geneformer-V2-104M/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "architectures": [
3
- "BertForMaskedLM"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "classifier_dropout": null,
7
- "hidden_act": "relu",
8
- "hidden_dropout_prob": 0.1,
9
- "hidden_size": 768,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 3072,
12
- "layer_norm_eps": 1e-12,
13
- "max_position_embeddings": 4096,
14
- "model_type": "bert",
15
- "num_attention_heads": 12,
16
- "num_hidden_layers": 12,
17
- "pad_token_id": 0,
18
- "position_embedding_type": "absolute",
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.44.2",
21
- "type_vocab_size": 2,
22
- "use_cache": true,
23
- "vocab_size": 20275
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Geneformer-V2-104M/generation_config.json DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "pad_token_id": 0,
4
- "transformers_version": "4.44.2"
5
- }
 
 
 
 
 
 
Geneformer-V2-104M/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fff5cba29ddd8792991fa77b4872246fbe548a178cebda3775cdc72b67780e7f
3
- size 417571156
 
 
 
 
Geneformer-V2-104M/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d8ddd9e4f35b5fe23a3adaae03aa4480705ca82eed546a488f970adb3752d9d
3
- size 5496
 
 
 
 
Geneformer-V2-104M_CLcancer/config.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "_name_or_path": "/gladstone/theodoris/lab/ctheodoris/gf-104m/models/241127_143148_geneformer_94M_L12_emb768_SL4096_E3_B18_LR0.0002_LScosine_WR0.007_Oadamw_DS13/models",
3
- "architectures": [
4
- "BertForMaskedLM"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "classifier_dropout": null,
8
- "hidden_act": "relu",
9
- "hidden_dropout_prob": 0.1,
10
- "hidden_size": 768,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 3072,
13
- "layer_norm_eps": 1e-12,
14
- "max_position_embeddings": 4096,
15
- "model_type": "bert",
16
- "num_attention_heads": 12,
17
- "num_hidden_layers": 12,
18
- "pad_token_id": 0,
19
- "position_embedding_type": "absolute",
20
- "torch_dtype": "float32",
21
- "transformers_version": "4.37.1",
22
- "type_vocab_size": 2,
23
- "use_cache": true,
24
- "vocab_size": 20275
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Geneformer-V2-104M_CLcancer/generation_config.json DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "pad_token_id": 0,
4
- "transformers_version": "4.37.1"
5
- }
 
 
 
 
 
 
Geneformer-V2-104M_CLcancer/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:827738139bfed4bafa9d1f3df7c6146da2e3b85f7225076adc32c6eda0ba4357
3
- size 417571156
 
 
 
 
Geneformer-V2-104M_CLcancer/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cf8ce52b498253adc6df53197a99821fa145c19b8ae5eeb8d15be76b8b7ddb3
3
- size 4984
 
 
 
 
Geneformer-V2-316M/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "architectures": [
3
- "BertForMaskedLM"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "classifier_dropout": null,
7
- "hidden_act": "relu",
8
- "hidden_dropout_prob": 0.1,
9
- "hidden_size": 1152,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 4608,
12
- "layer_norm_eps": 1e-12,
13
- "max_position_embeddings": 4096,
14
- "model_type": "bert",
15
- "num_attention_heads": 18,
16
- "num_hidden_layers": 18,
17
- "pad_token_id": 0,
18
- "position_embedding_type": "absolute",
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.44.2",
21
- "type_vocab_size": 2,
22
- "use_cache": true,
23
- "vocab_size": 20275
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Geneformer-V2-316M/generation_config.json DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "pad_token_id": 0,
4
- "transformers_version": "4.44.2"
5
- }
 
 
 
 
 
 
Geneformer-V2-316M/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:965ceccea81953d362081ef3843560a0e4fef88d396c28017881f1e94b1246f3
3
- size 1265455076
 
 
 
 
Geneformer-V2-316M/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e45150f9a4ca34cb4e91ce79f65f3d99d9d66df9f66a37517a352d291008e0b8
3
- size 5432
 
 
 
 
MANIFEST.in CHANGED
@@ -1,9 +1,2 @@
1
- include geneformer/gene_median_dictionary_gc104M.pkl
2
- include geneformer/gene_name_id_dict_gc104M.pkl
3
- include geneformer/ensembl_mapping_dict_gc104M.pkl
4
- include geneformer/token_dictionary_gc104M.pkl
5
-
6
- include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30M.pkl
7
- include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30M.pkl
8
- include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl
9
- include geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl
 
1
+ include geneformer/gene_median_dictionary.pkl
2
+ include geneformer/token_dictionary.pkl
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,91 +1,30 @@
1
  ---
2
  datasets: ctheodoris/Genecorpus-30M
3
- license: apache-2.0
4
- tags:
5
- - single-cell
6
- - genomics
7
  ---
8
  # Geneformer
9
- Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
10
 
11
- - See [our manuscript](https://rdcu.be/ddrx0) for details of the original model trained on ~30 million transcriptomes in June 2021 and the initial report of our in silico perturbation and cell and gene classification strategies.
12
- - See [our manuscript](https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf) for details of the expanded model, now trained on ~104 million transcriptomes, and our continual learning, multitask learning, and quantization strategies.
13
- - See [geneformer.readthedocs.io](https://geneformer.readthedocs.io) for documentation.
14
 
15
  # Model Description
16
- Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer V1 was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million human single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. The current updated Geneformer V2 is pretrained on ~104 million human single cell transcriptomes (non-cancer). The cancer continual learning V2 variant was continually pretrained on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
17
 
18
- Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus (~30M for V1, ~104M for V2). The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
19
 
20
- The rank value encoding of each single cell’s transcriptome then proceeds through N layers of transformer encoder units, where N varies dependent on the model size. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
21
 
22
- We detail applications and results in [our manuscript](https://rdcu.be/ddrx0).
23
-
24
- During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an induced pluripotent stem cell (iPSC) model of the disease. Overall, Geneformer represents a foundational AI model pretrained on a large-scale corpus of human single cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
25
-
26
- The repository includes the following pretrained models:
27
-
28
- - Geneformer-V1-10M: original model trained June 2021 on ~30M human single cell transcriptomes, 10M parameters, input size 2048, vocabulary ~25K protein-coding or non-coding RNA genes
29
- - Geneformer-V2-104M and Geneformer-V2-316M: updated model trained Dec 2024 on ~104M human single cell transcriptomes, 104M or 316M parameters, input size 4096, vocabulary ~20K protein-coding genes
30
-
31
- The current default model in the main directory of the repository is Geneformer-V2-316M.
32
-
33
- The repository also contains fine-tuned models in the fine_tuned_models directory and the cancer-tuned model following continual learning on ~14 million cancer cells, Geneformer-V2-104M_CLcancer.
34
 
35
  # Application
36
  The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
37
 
38
- Example applications demonstrated in [our manuscript](https://rdcu.be/ddrx0) include:
39
-
40
- *Fine-tuning*:
41
- - transcription factor dosage sensitivity
42
- - chromatin dynamics (bivalently marked promoters)
43
- - transcription factor regulatory range
44
- - gene network centrality
45
- - transcription factor targets
46
- - cell type annotation
47
- - batch integration
48
- - cell state classification across differentiation
49
- - disease classification
50
- - in silico perturbation to determine disease-driving genes
51
- - in silico treatment to determine candidate therapeutic targets
52
-
53
- *Zero-shot learning*:
54
- - batch integration
55
- - gene context specificity
56
- - in silico reprogramming
57
- - in silico differentiation
58
- - in silico perturbation to determine impact on cell state
59
- - in silico perturbation to determine transcription factor targets
60
- - in silico perturbation to determine transcription factor cooperativity
61
-
62
  # Installation
63
- In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model, fine-tuning the model, extracting and plotting cell embeddings, and performing in silico perturbation with either the pretrained or fine-tuned models. To install (~20s):
64
 
65
  ```bash
66
- # Make sure you have git-lfs installed (https://git-lfs.com)
67
- git lfs install
68
  git clone https://huggingface.co/ctheodoris/Geneformer
69
  cd Geneformer
70
  pip install .
71
  ```
72
 
73
- For usage, see [examples](https://huggingface.co/ctheodoris/Geneformer/tree/main/examples) for:
74
- - tokenizing transcriptomes
75
- - pretraining
76
- - hyperparameter tuning
77
- - fine-tuning
78
- - extracting and plotting cell embeddings
79
- - in silico perturbation
80
-
81
- Please also see [here](https://tinyurl.com/geneformertutorial) for a quickstart tutorial for predicting candidate therapeutic targets with Geneformer.
82
-
83
- Complete documentation is available at https://geneformer.readthedocs.io/en/latest/.
84
-
85
- Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the [example_input_files directory](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files) in the dataset repository, but these only represent a few example fine-tuning applications.
86
-
87
- Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.). Importantly, as usual for deep learning models, there are no uniformly applicable default hyperparameters for Geneformer.
88
-
89
- # Citations
90
- - C V Theodoris#, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor#. Transfer learning enables predictions in network biology. _**Nature**_, 31 May 2023. (#co-corresponding authors)
91
- - H Chen*, M S Venkatesh*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka†, C V Theodoris†#. Quantized multi-task learning for context-specific representations of gene network dynamics. _**bioRxiv**_, 19 Aug 2024. (*co-first authors, †co-senior authors, #corresponding author)
 
1
  ---
2
  datasets: ctheodoris/Genecorpus-30M
 
 
 
 
3
  ---
4
  # Geneformer
5
+ Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
6
 
7
+ See [our manuscript](https://www.nature.com/articles/s41586-023-06139-9) for details.
 
 
8
 
9
  # Model Description
10
+ Geneformer is a foundation transformer model pretrained on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a pretraining corpus comprised of ~30 million single cell transcriptomes from a broad range of human tissues. Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell normalized by their expression across the entire Genecorpus-30M. The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across Genecorpus-30M to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by normalizing them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
11
 
12
+ The rank value encoding of each single cell’s transcriptome then proceeds through six transformer encoder units. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
13
 
14
+ We detail applications and results in [our manuscript](https://www.nature.com/articles/s41586-023-06139-9).
15
 
16
+ During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. Fine-tuning Geneformer towards a diverse panel of downstream tasks relevant to chromatin and network dynamics using limited task-specific data demonstrated that Geneformer consistently boosted predictive accuracy. Applied to disease modeling with limited patient data, Geneformer identified candidate therapeutic targets. Overall, Geneformer represents an invaluable pretrained model from which fine-tuning towards a broad range of downstream applications can be pursued to accelerate discovery of key network regulators and candidate therapeutic targets.
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # Application
19
  The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Installation
22
+ In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model, and performing in silico perturbation with either the pretrained or fine-tuned models. To install:
23
 
24
  ```bash
 
 
25
  git clone https://huggingface.co/ctheodoris/Geneformer
26
  cd Geneformer
27
  pip install .
28
  ```
29
 
30
+ For usage, see [examples](https://huggingface.co/ctheodoris/Geneformer/tree/main/examples) for pretraining and fine-tuning. Please note that GPU resources are required for efficient usage of Geneformer.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarking/castle_cell_type_annotation.r ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Usage: Rscript castle_cell_type_annotation.r organ
2
+
3
+ # parse ordered arguments
4
+ args <- commandArgs(trailingOnly=TRUE)
5
+ organ <- args[1]
6
+
7
+ suppressPackageStartupMessages(library(scater))
8
+ suppressPackageStartupMessages(library(xgboost))
9
+ suppressPackageStartupMessages(library(igraph))
10
+ BREAKS=c(-1, 0, 1, 6, Inf)
11
+ nFeatures = 100
12
+
13
+ print(paste("Training ", organ, sep=""))
14
+
15
+ # import training and test data
16
+ rootdir="/path/to/data/"
17
+ train_counts <- t(as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_data_train.csv", sep=""), row.names = 1)))
18
+ test_counts <- t(as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_data_test.csv", sep=""), row.names = 1)))
19
+ train_celltype <- as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_celltype_train.csv", sep="")))
20
+ test_celltype <- as.matrix(read.csv(file = paste(rootdir, organ, "_filtered_celltype_test.csv", sep="")))
21
+
22
+ # select features
23
+ sourceCellTypes = as.factor(train_celltype[,"Cell_type"])
24
+ ds = rbind(train_counts,test_counts)
25
+ ds[is.na(ds)] <- 0
26
+ isSource = c(rep(TRUE,nrow(train_counts)), rep(FALSE,nrow(test_counts)))
27
+ topFeaturesAvg = colnames(ds[isSource,])[order(apply(ds[isSource,], 2, mean), decreasing = T)]
28
+ topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),sourceCellTypes,method = "nmi") }), decreasing = T))
29
+ selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
30
+ tmp = cor(ds[isSource,selectedFeatures], method = "pearson")
31
+ tmp[!lower.tri(tmp)] = 0
32
+ selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
33
+ remove(tmp)
34
+
35
+ # bin expression values and expand features by bins
36
+ dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
37
+ nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
38
+ ds = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
39
+ remove(dsBins, nUniq)
40
+
41
+ # train model
42
+ train = runif(nrow(ds[isSource,]))<0.8
43
+ # slightly different setup for multiclass and binary classification
44
+ if (length(unique(sourceCellTypes)) > 2) {
45
+ xg=xgboost(data=ds[isSource,][train, ] ,
46
+ label=as.numeric(sourceCellTypes[train])-1,
47
+ objective="multi:softmax", num_class=length(unique(sourceCellTypes)),
48
+ eta=0.7 , nthread=5, nround=20, verbose=0,
49
+ gamma=0.001, max_depth=5, min_child_weight=10)
50
+ } else {
51
+ xg=xgboost(data=ds[isSource,][train, ] ,
52
+ label=as.numeric(sourceCellTypes[train])-1,
53
+ eta=0.7 , nthread=5, nround=20, verbose=0,
54
+ gamma=0.001, max_depth=5, min_child_weight=10)
55
+ }
56
+
57
+ # validate model
58
+ predictedClasses = predict(xg, ds[!isSource, ])
59
+ testCellTypes = as.factor(test_celltype[,"Cell_type"])
60
+ trueClasses <- as.numeric(testCellTypes)-1
61
+
62
+ cm <- as.matrix(table(Actual = trueClasses, Predicted = predictedClasses))
63
+ n <- sum(cm)
64
+ nc = nrow(cm) # number of classes
65
+ diag = diag(cm) # number of correctly classified instances per class
66
+ rowsums = apply(cm, 1, sum) # number of instances per class
67
+ colsums = apply(cm, 2, sum) # number of predictions per class
68
+ p = rowsums / n # distribution of instances over the actual classes
69
+ q = colsums / n # distribution of instances over the predicted classes
70
+ accuracy = sum(diag) / n
71
+ precision = diag / colsums
72
+ recall = diag / rowsums
73
+ f1 = 2 * precision * recall / (precision + recall)
74
+ macroF1 = mean(f1)
75
+
76
+ print(paste(organ, " accuracy: ", accuracy, sep=""))
77
+ print(paste(organ, " macroF1: ", macroF1, sep=""))
78
+
79
+ results_df = data.frame(Accuracy=c(accuracy),macroF1=c(macroF1))
80
+ write.csv(results_df,paste(rootdir, organ, "_castle_results_test.csv", sep=""), row.names = FALSE)
benchmarking/prepare_datasplits_for_cell_type_annotation.ipynb ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "25107132",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Preparing train and test data splits for cell type annotation application"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 3,
14
+ "id": "83d8d249-affe-45dd-915e-992b4b35b31a",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import pandas as pd\n",
20
+ "from sklearn.model_selection import train_test_split\n",
21
+ "from tqdm.notebook import tqdm\n",
22
+ "from collections import Counter\n",
23
+ "import pickle"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 4,
29
+ "id": "e3e6a2bf-44c8-4164-9ecd-1686230ea8be",
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "data": {
34
+ "text/plain": [
35
+ "['pancreas',\n",
36
+ " 'liver',\n",
37
+ " 'blood',\n",
38
+ " 'lung',\n",
39
+ " 'spleen',\n",
40
+ " 'placenta',\n",
41
+ " 'colorectum',\n",
42
+ " 'kidney',\n",
43
+ " 'brain']"
44
+ ]
45
+ },
46
+ "execution_count": 4,
47
+ "metadata": {},
48
+ "output_type": "execute_result"
49
+ }
50
+ ],
51
+ "source": [
52
+ "rootdir = \"/path/to/data/\"\n",
53
+ "\n",
54
+ "# collect panel of tissues to test\n",
55
+ "dir_list = []\n",
56
+ "for dir_i in os.listdir(rootdir):\n",
57
+ " if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n",
58
+ " dir_list += [dir_i]\n",
59
+ "dir_list"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 5,
65
+ "id": "0b205eec-a518-472a-ab90-dd63ef9803cd",
66
+ "metadata": {},
67
+ "outputs": [
68
+ {
69
+ "data": {
70
+ "text/html": [
71
+ "<div>\n",
72
+ "<style scoped>\n",
73
+ " .dataframe tbody tr th:only-of-type {\n",
74
+ " vertical-align: middle;\n",
75
+ " }\n",
76
+ "\n",
77
+ " .dataframe tbody tr th {\n",
78
+ " vertical-align: top;\n",
79
+ " }\n",
80
+ "\n",
81
+ " .dataframe thead th {\n",
82
+ " text-align: right;\n",
83
+ " }\n",
84
+ "</style>\n",
85
+ "<table border=\"1\" class=\"dataframe\">\n",
86
+ " <thead>\n",
87
+ " <tr style=\"text-align: right;\">\n",
88
+ " <th></th>\n",
89
+ " <th>filter_pass</th>\n",
90
+ " <th>original_cell_id</th>\n",
91
+ " </tr>\n",
92
+ " </thead>\n",
93
+ " <tbody>\n",
94
+ " <tr>\n",
95
+ " <th>0</th>\n",
96
+ " <td>0</td>\n",
97
+ " <td>C_1</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>1</th>\n",
101
+ " <td>1</td>\n",
102
+ " <td>C_2</td>\n",
103
+ " </tr>\n",
104
+ " <tr>\n",
105
+ " <th>2</th>\n",
106
+ " <td>0</td>\n",
107
+ " <td>C_3</td>\n",
108
+ " </tr>\n",
109
+ " <tr>\n",
110
+ " <th>3</th>\n",
111
+ " <td>1</td>\n",
112
+ " <td>C_4</td>\n",
113
+ " </tr>\n",
114
+ " <tr>\n",
115
+ " <th>4</th>\n",
116
+ " <td>0</td>\n",
117
+ " <td>C_5</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>...</th>\n",
121
+ " <td>...</td>\n",
122
+ " <td>...</td>\n",
123
+ " </tr>\n",
124
+ " <tr>\n",
125
+ " <th>9590</th>\n",
126
+ " <td>1</td>\n",
127
+ " <td>C_9591</td>\n",
128
+ " </tr>\n",
129
+ " <tr>\n",
130
+ " <th>9591</th>\n",
131
+ " <td>1</td>\n",
132
+ " <td>C_9592</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>9592</th>\n",
136
+ " <td>1</td>\n",
137
+ " <td>C_9593</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>9593</th>\n",
141
+ " <td>1</td>\n",
142
+ " <td>C_9594</td>\n",
143
+ " </tr>\n",
144
+ " <tr>\n",
145
+ " <th>9594</th>\n",
146
+ " <td>1</td>\n",
147
+ " <td>C_9595</td>\n",
148
+ " </tr>\n",
149
+ " </tbody>\n",
150
+ "</table>\n",
151
+ "<p>9595 rows × 2 columns</p>\n",
152
+ "</div>"
153
+ ],
154
+ "text/plain": [
155
+ " filter_pass original_cell_id\n",
156
+ "0 0 C_1\n",
157
+ "1 1 C_2\n",
158
+ "2 0 C_3\n",
159
+ "3 1 C_4\n",
160
+ "4 0 C_5\n",
161
+ "... ... ...\n",
162
+ "9590 1 C_9591\n",
163
+ "9591 1 C_9592\n",
164
+ "9592 1 C_9593\n",
165
+ "9593 1 C_9594\n",
166
+ "9594 1 C_9595\n",
167
+ "\n",
168
+ "[9595 rows x 2 columns]"
169
+ ]
170
+ },
171
+ "execution_count": 5,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
177
+ "# dictionary of cell barcodes that passed QC filtering applied by Geneformer \n",
178
+ "# to ensure same cells were used for comparison\n",
179
+ "with open(f\"{rootdir}deepsort_filter_dict.pickle\", \"rb\") as fp:\n",
180
+ " filter_dict = pickle.load(fp)\n",
181
+ "\n",
182
+ "# for example:\n",
183
+ "filter_dict[\"human_Placenta9595_data\"]"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "id": "207e3571-0236-4493-83b3-a89b67b16cb2",
190
+ "metadata": {
191
+ "tags": []
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "for dir_name in tqdm(dir_list):\n",
196
+ "\n",
197
+ " df = pd.DataFrame()\n",
198
+ " ct_df = pd.DataFrame(columns=[\"Cell\",\"Cell_type\"])\n",
199
+ " \n",
200
+ " subrootdir = f\"{rootdir}{dir_name}/\"\n",
201
+ " for subdir, dirs, files in os.walk(subrootdir):\n",
202
+ " for i in range(len(files)):\n",
203
+ " file = files[i]\n",
204
+ " if file.endswith(\"_data.csv\"):\n",
205
+ " file_prefix = file.replace(\"_data.csv\",\"\")\n",
206
+ " sample_prefix = file.replace(\".csv\",\"\")\n",
207
+ " filter_df = filter_dict[sample_prefix]\n",
208
+ " sample_to_analyze = list(filter_df[filter_df[\"filter_pass\"]==1][\"original_cell_id\"])\n",
209
+ " \n",
210
+ " # collect data for each tissue\n",
211
+ " df_i = pd.read_csv(f\"{subrootdir}{file}\", index_col=0)\n",
212
+ " df_i = df_i[sample_to_analyze]\n",
213
+ " df_i.columns = [f\"{i}_{cell_id}\" for cell_id in df_i.columns]\n",
214
+ " df = pd.concat([df,df_i],axis=1)\n",
215
+ " \n",
216
+ " # collect cell type metadata\n",
217
+ " ct_df_i = pd.read_csv(f\"{subrootdir}{file_prefix}_celltype.csv\", index_col=0)\n",
218
+ " ct_df_i.columns = [\"Cell\",\"Cell_type\"]\n",
219
+ " ct_df_i[\"Cell\"] = [f\"{i}_{cell_id}\" for cell_id in ct_df_i[\"Cell\"]]\n",
220
+ " ct_df = pd.concat([ct_df,ct_df_i],axis=0)\n",
221
+ " \n",
222
+ " # per published scDeepsort method, filter data for cell types >0.5% of data\n",
223
+ " ct_counts = Counter(ct_df[\"Cell_type\"])\n",
224
+ " total_count = sum(ct_counts.values())\n",
225
+ " nonrare_cell_types = [cell_type for cell_type,count in ct_counts.items() if count>(total_count*0.005)]\n",
226
+ " nonrare_cells = list(ct_df[ct_df[\"Cell_type\"].isin(nonrare_cell_types)][\"Cell\"])\n",
227
+ " df = df[df.columns.intersection(nonrare_cells)]\n",
228
+ "\n",
229
+ " # split into 80/20 train/test data\n",
230
+ " train, test = train_test_split(df.T, test_size=0.2)\n",
231
+ " train = train.T\n",
232
+ " test = test.T \n",
233
+ " \n",
234
+ " # save filtered train/test data\n",
235
+ " train.to_csv(f\"{subrootdir}{dir_name}_filtered_data_train.csv\")\n",
236
+ " test.to_csv(f\"{subrootdir}{dir_name}_filtered_data_test.csv\")\n",
237
+ "\n",
238
+ " # split metadata into train/test data\n",
239
+ "    ct_df_train = ct_df[ct_df[\"Cell\"].isin(list(train.columns))].copy()  # explicit copy: a helper column is added below\n",
240
+ "    ct_df_test = ct_df[ct_df[\"Cell\"].isin(list(test.columns))].copy()  # explicit copy, same reason\n",
241
+ "    train_order_dict = dict(zip(train.columns,[i for i in range(len(train.columns))]))\n",
242
+ "    test_order_dict = dict(zip(test.columns,[i for i in range(len(test.columns))]))\n",
243
+ "    ct_df_train[\"order\"] = [train_order_dict[cell_id] for cell_id in ct_df_train[\"Cell\"]]  # position of each cell in train.columns\n",
244
+ "    ct_df_test[\"order\"] = [test_order_dict[cell_id] for cell_id in ct_df_test[\"Cell\"]]  # position of each cell in test.columns\n",
245
+ "    ct_df_train = ct_df_train.sort_values(\"order\")  # reorder metadata rows to match the data column order\n",
246
+ "    ct_df_test = ct_df_test.sort_values(\"order\")\n",
247
+ "    ct_df_train = ct_df_train.drop(\"order\",axis=1)  # helper column is no longer needed once sorted\n",
248
+ "    ct_df_test = ct_df_test.drop(\"order\",axis=1)\n",
249
+ " assert list(ct_df_train[\"Cell\"]) == list(train.columns)\n",
250
+ " assert list(ct_df_test[\"Cell\"]) == list(test.columns)\n",
251
+ " train_labels = list(Counter(ct_df_train[\"Cell_type\"]).keys())\n",
252
+ " test_labels = list(Counter(ct_df_test[\"Cell_type\"]).keys())\n",
253
+ " assert set(train_labels) == set(test_labels)\n",
254
+ " \n",
255
+ " # save train/test cell type annotations\n",
256
+ " ct_df_train.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")\n",
257
+ " ct_df_test.to_csv(f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\")\n",
258
+ " "
259
+ ]
260
+ }
261
+ ],
262
+ "metadata": {
263
+ "kernelspec": {
264
+ "display_name": "Python 3.8.6 64-bit ('3.8.6')",
265
+ "language": "python",
266
+ "name": "python3"
267
+ },
268
+ "language_info": {
269
+ "codemirror_mode": {
270
+ "name": "ipython",
271
+ "version": 3
272
+ },
273
+ "file_extension": ".py",
274
+ "mimetype": "text/x-python",
275
+ "name": "python",
276
+ "nbconvert_exporter": "python",
277
+ "pygments_lexer": "ipython3",
278
+ "version": "3.8.6"
279
+ },
280
+ "vscode": {
281
+ "interpreter": {
282
+ "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
283
+ }
284
+ }
285
+ },
286
+ "nbformat": 4,
287
+ "nbformat_minor": 5
288
+ }
benchmarking/randomForest_token_classifier_dosageTF_10k.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
benchmarking/scDeepsort_train_predict.ipynb ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "83d8d249-affe-45dd-915e-992b4b35b31a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import numpy as np\n",
12
+ "import pandas as pd\n",
13
+ "import deepsort\n",
14
+ "from sklearn.metrics import accuracy_score, f1_score\n",
15
+ "from tqdm.notebook import tqdm\n",
16
+ "import pickle"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 4,
22
+ "id": "25de46ec-8a41-484d-8e14-d2b19768fc2c",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "def compute_metrics(labels, preds):\n",
27
+ "\n",
28
+ " # calculate accuracy and macro f1 using sklearn's function\n",
29
+ " acc = accuracy_score(labels, preds)\n",
30
+ " macro_f1 = f1_score(labels, preds, average='macro')\n",
31
+ " return {\n",
32
+ " 'accuracy': acc,\n",
33
+ " 'macro_f1': macro_f1\n",
34
+ " }"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 5,
40
+ "id": "a4029b2b-afca-4300-82a2-082fec59f191",
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "data": {
45
+ "text/plain": [
46
+ "['pancreas',\n",
47
+ " 'liver',\n",
48
+ " 'blood',\n",
49
+ " 'lung',\n",
50
+ " 'spleen',\n",
51
+ " 'placenta',\n",
52
+ " 'colorectum',\n",
53
+ " 'kidney',\n",
54
+ " 'brain']"
55
+ ]
56
+ },
57
+ "execution_count": 5,
58
+ "metadata": {},
59
+ "output_type": "execute_result"
60
+ }
61
+ ],
62
+ "source": [
63
+ "rootdir = \"/path/to/data/\"\n",
64
+ "\n",
65
+ "dir_list = []\n",
66
+ "for dir_i in os.listdir(rootdir):\n",
67
+ " if (\"results\" not in dir_i) & (os.path.isdir(os.path.join(rootdir, dir_i))):\n",
68
+ " dir_list += [dir_i]\n",
69
+ "dir_list"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "ddcdc5cd-871e-4fd2-8457-18d3049fa76c",
76
+ "metadata": {
77
+ "tags": []
78
+ },
79
+ "outputs": [],
80
+ "source": [
81
+ "output_dir = \"results_EDefault_filtered\"\n",
82
+ "n_epochs = \"Default\" # scDeepsort default epochs = 300\n",
83
+ "\n",
84
+ "results_dict = dict()\n",
85
+ "for dir_name in tqdm(dir_list):\n",
86
+ " print(f\"TRAINING: {dir_name}\")\n",
87
+ " subrootdir = f\"{rootdir}{dir_name}/\"\n",
88
+ " train_files = [(f\"{subrootdir}{dir_name}_filtered_data_train.csv\",f\"{subrootdir}{dir_name}_filtered_celltype_train.csv\")]\n",
89
+ " test_file = f\"{subrootdir}{dir_name}_filtered_data_test.csv\"\n",
90
+ " label_file = f\"{subrootdir}{dir_name}_filtered_celltype_test.csv\"\n",
91
+ " \n",
92
+ " # define the model\n",
93
+ " model = deepsort.DeepSortClassifier(species='human',\n",
94
+ " tissue=dir_name,\n",
95
+ " gpu_id=0,\n",
96
+ " random_seed=1,\n",
97
+ " validation_fraction=0) # use all training data (already held out 20% in test data file)\n",
98
+ "\n",
99
+ " # fit the model\n",
100
+ " model.fit(train_files, save_path=f\"{subrootdir}{output_dir}\")\n",
101
+ " \n",
102
+ " # use the saved model to predict cell types in test data\n",
103
+ " model.predict(input_file=test_file,\n",
104
+ " model_path=f\"{subrootdir}{output_dir}\",\n",
105
+ " save_path=f\"{subrootdir}{output_dir}\",\n",
106
+ " unsure_rate=0,\n",
107
+ " file_type='csv')\n",
108
+ " labels_df = pd.read_csv(label_file)\n",
109
+ " preds_df = pd.read_csv(f\"{subrootdir}{output_dir}/human_{dir_name}_{dir_name}_filtered_data_test.csv\")\n",
110
+ " label_cell_ids = labels_df[\"Cell\"]\n",
111
+ " pred_cell_ids = preds_df[\"index\"]\n",
112
+ " assert list(label_cell_ids) == list(pred_cell_ids)\n",
113
+ " labels = list(labels_df[\"Cell_type\"])\n",
114
+ "    first_subtype = preds_df[\"cell_subtype\"][0]\n",
115
+ "    if isinstance(first_subtype, float) and np.isnan(first_subtype):  # first cell_subtype is NaN (presumably no subtype predictions): score cell_type only\n",
116
+ "        preds = list(preds_df[\"cell_type\"])\n",
117
+ "        results = compute_metrics(labels, preds)\n",
118
+ "    else:  # otherwise score both prediction columns and keep the one with higher accuracy, so results is always set for this tissue\n",
119
+ "        preds1 = list(preds_df[\"cell_type\"])\n",
120
+ "        preds2 = list(preds_df[\"cell_subtype\"])\n",
121
+ "        results1 = compute_metrics(labels, preds1)\n",
122
+ "        results2 = compute_metrics(labels, preds2)\n",
123
+ "        if results2[\"accuracy\"] > results1[\"accuracy\"]:\n",
124
+ "            results = results2\n",
125
+ "        else:\n",
126
+ "            results = results1\n",
127
+ " \n",
128
+ " print(f\"{dir_name}: {results}\")\n",
129
+ " results_dict[dir_name] = results\n",
130
+ " with open(f\"{subrootdir}deepsort_E{n_epochs}_filtered_pred_{dir_name}.pickle\", \"wb\") as output_file:\n",
131
+ " pickle.dump(results, output_file)\n",
132
+ "\n",
133
+ "# save results\n",
134
+ "with open(f\"{rootdir}deepsort_E{n_epochs}_filtered_pred_dict.pickle\", \"wb\") as output_file:\n",
135
+ " pickle.dump(results_dict, output_file)\n",
136
+ " "
137
+ ]
138
+ }
139
+ ],
140
+ "metadata": {
141
+ "kernelspec": {
142
+ "display_name": "Python 3.8.6 64-bit ('3.8.6')",
143
+ "language": "python",
144
+ "name": "python3"
145
+ },
146
+ "language_info": {
147
+ "codemirror_mode": {
148
+ "name": "ipython",
149
+ "version": 3
150
+ },
151
+ "file_extension": ".py",
152
+ "mimetype": "text/x-python",
153
+ "name": "python",
154
+ "nbconvert_exporter": "python",
155
+ "pygments_lexer": "ipython3",
156
+ "version": "3.8.6"
157
+ },
158
+ "vscode": {
159
+ "interpreter": {
160
+ "hash": "eba1599a1f7e611c14c87ccff6793920aa63510b01fc0e229d6dd014149b8829"
161
+ }
162
+ }
163
+ },
164
+ "nbformat": 4,
165
+ "nbformat_minor": 5
166
+ }
config.json CHANGED
@@ -2,23 +2,22 @@
2
  "architectures": [
3
  "BertForMaskedLM"
4
  ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "classifier_dropout": null,
7
  "hidden_act": "relu",
8
- "hidden_dropout_prob": 0.1,
9
- "hidden_size": 1152,
10
  "initializer_range": 0.02,
11
- "intermediate_size": 4608,
12
  "layer_norm_eps": 1e-12,
13
- "max_position_embeddings": 4096,
14
  "model_type": "bert",
15
- "num_attention_heads": 18,
16
- "num_hidden_layers": 18,
17
  "pad_token_id": 0,
18
  "position_embedding_type": "absolute",
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.44.2",
21
  "type_vocab_size": 2,
22
  "use_cache": true,
23
- "vocab_size": 20275
24
  }
 
2
  "architectures": [
3
  "BertForMaskedLM"
4
  ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "gradient_checkpointing": false,
7
  "hidden_act": "relu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
  "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
  "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
  "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 6,
17
  "pad_token_id": 0,
18
  "position_embedding_type": "absolute",
19
+ "transformers_version": "4.6.0",
 
20
  "type_vocab_size": 2,
21
  "use_cache": true,
22
+ "vocab_size": 25426
23
  }
docs/Makefile DELETED
@@ -1,20 +0,0 @@
1
- # Minimal makefile for Sphinx documentation
2
- #
3
-
4
- # You can set these variables from the command line, and also
5
- # from the environment for the first two.
6
- SPHINXOPTS ?=
7
- SPHINXBUILD ?= sphinx-build
8
- SOURCEDIR = source
9
- BUILDDIR = build
10
-
11
- # Put it first so that "make" without argument is like "make help".
12
- help:
13
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
-
15
- .PHONY: help Makefile
16
-
17
- # Catch-all target: route all unknown targets to Sphinx using the new
18
- # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
- %: Makefile
20
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/make.bat DELETED
@@ -1,35 +0,0 @@
1
- @ECHO OFF
2
-
3
- pushd %~dp0
4
-
5
- REM Command file for Sphinx documentation
6
-
7
- if "%SPHINXBUILD%" == "" (
8
- set SPHINXBUILD=sphinx-build
9
- )
10
- set SOURCEDIR=source
11
- set BUILDDIR=build
12
-
13
- %SPHINXBUILD% >NUL 2>NUL
14
- if errorlevel 9009 (
15
- echo.
16
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17
- echo.installed, then set the SPHINXBUILD environment variable to point
18
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
19
- echo.may add the Sphinx directory to PATH.
20
- echo.
21
- echo.If you don't have Sphinx installed, grab it from
22
- echo.https://www.sphinx-doc.org/
23
- exit /b 1
24
- )
25
-
26
- if "%1" == "" goto help
27
-
28
- %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
- goto end
30
-
31
- :help
32
- %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
-
34
- :end
35
- popd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/requirements.txt DELETED
@@ -1,3 +0,0 @@
1
- .
2
- sphinx_rtd_theme==2.0.0
3
- nbsphinx==0.9.3
 
 
 
 
docs/source/_static/css/custom.css DELETED
@@ -1,40 +0,0 @@
1
- /* top left logo */
2
- .wy-side-nav-search, .wy-nav-top {
3
- background: linear-gradient(15deg, #13547a 0%, #80d0c7 100%);
4
- }
5
-
6
-
7
- /* unvisited link */
8
- .wy-nav-content a:link {
9
- color: #067abd;
10
- }
11
-
12
- /* visited link */
13
- .wy-nav-content a:visited {
14
- color: #4b827c;
15
- }
16
-
17
- /* mouse over link */
18
- .wy-nav-content a:hover {
19
- color: #80d0c7;
20
- }
21
-
22
- /* selected link */
23
- .wy-nav-content a:active {
24
- color: #4b827c;
25
- }
26
-
27
- /* class object */
28
- .sig.sig-object {
29
- padding: 5px 5px 5px 5px;
30
- background-color: #ececec;
31
- border-style: solid;
32
- border-color: black;
33
- border-width: 1px 0;
34
- }
35
-
36
- /* parameter object */
37
- dt {
38
- padding: 5px 5px 5px 5px;
39
- background-color: #ececec;
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/_static/gf_logo.png DELETED
Binary file (48.2 kB)
 
docs/source/about.rst DELETED
@@ -1,49 +0,0 @@
1
- About
2
- =====
3
-
4
- Model Description
5
- -----------------
6
-
7
- **Geneformer** is a context-aware, attention-based deep learning model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in settings with limited data in network biology. During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the attention weights of the model in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an iPSC model of the disease. Overall, Geneformer represents a foundational deep learning model pretrained on a large-scale corpus of human single cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
8
-
9
- In `our manuscript <https://rdcu.be/ddrx0>`_, we report results for the original 6 layer Geneformer model pretrained on Genecorpus-30M. We additionally provide within the repository a 12 layer Geneformer model, scaled up with retained width:depth aspect ratio, also pretrained on Genecorpus-30M.
10
-
11
- Both the `6 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors>`_ and `12 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-12L-30M-i2048/pytorch_model.bin>`_ layer Geneformer models were pretrained in June 2021.
12
-
13
- Also see `our 2024 manuscript <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_ for details of the `expanded model <https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors>`_ trained on ~95 million transcriptomes in April 2024 and our continual learning, multitask learning, and quantization strategies.
14
-
15
- Application
16
- -----------
17
-
18
- The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
19
-
20
- Example applications demonstrated in `our manuscript <https://rdcu.be/ddrx0>`_ include:
21
-
22
- | *Fine-tuning*:
23
- | - transcription factor dosage sensitivity
24
- | - chromatin dynamics (bivalently marked promoters)
25
- | - transcription factor regulatory range
26
- | - gene network centrality
27
- | - transcription factor targets
28
- | - cell type annotation
29
- | - batch integration
30
- | - cell state classification across differentiation
31
- | - disease classification
32
- | - in silico perturbation to determine disease-driving genes
33
- | - in silico treatment to determine candidate therapeutic targets
34
-
35
- | *Zero-shot learning*:
36
- | - batch integration
37
- | - gene context specificity
38
- | - in silico reprogramming
39
- | - in silico differentiation
40
- | - in silico perturbation to determine impact on cell state
41
- | - in silico perturbation to determine transcription factor targets
42
- | - in silico perturbation to determine transcription factor cooperativity
43
-
44
- Citations
45
- ---------
46
-
47
- | C V Theodoris #, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor #. `Transfer learning enables predictions in network biology. <https://rdcu.be/ddrx0>`_ *Nature*, 31 May 2023. (# co-corresponding authors)
48
-
49
- | H Chen \*, M S Venkatesh \*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka †, C V Theodoris † #. `Quantized multi-task learning for context-specific representations of gene network dynamics. <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_ *bioRxiv*, 19 Aug 2024. (\* co-first authors, † co-senior authors, # corresponding author)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/api.rst DELETED
@@ -1,51 +0,0 @@
1
- API
2
- ===
3
-
4
- Tokenizer
5
- ---------
6
-
7
- .. toctree::
8
- :maxdepth: 1
9
-
10
- geneformer.tokenizer
11
-
12
- Classifier
13
- ----------
14
-
15
- .. toctree::
16
- :maxdepth: 1
17
-
18
- geneformer.classifier
19
-
20
- Multitask Classifier
21
- --------------------
22
-
23
- .. toctree::
24
- :maxdepth: 1
25
-
26
- geneformer.mtl_classifier
27
-
28
- Embedding Extractor
29
- -------------------
30
-
31
- .. toctree::
32
- :maxdepth: 1
33
-
34
- geneformer.emb_extractor
35
-
36
- In Silico Perturber
37
- -------------------
38
-
39
- .. toctree::
40
- :maxdepth: 1
41
-
42
- geneformer.in_silico_perturber
43
-
44
-
45
- In Silico Perturber Stats
46
- -------------------------
47
-
48
- .. toctree::
49
- :maxdepth: 1
50
-
51
- geneformer.in_silico_perturber_stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/conf.py DELETED
@@ -1,80 +0,0 @@
1
- # Configuration file for the Sphinx documentation builder.
2
- #
3
- # For the full list of built-in configuration values, see the documentation:
4
- # https://www.sphinx-doc.org/en/master/usage/configuration.html
5
-
6
- import pathlib
7
- import re
8
- import sys
9
-
10
- from sphinx.ext import autodoc
11
-
12
- sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
13
-
14
-
15
- # -- Project information -----------------------------------------------------
16
- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
17
-
18
- project = "geneformer"
19
- copyright = "2024, Christina Theodoris"
20
- author = "Christina Theodoris"
21
- release = "0.1.0"
22
- repository_url = "https://huggingface.co/ctheodoris/Geneformer"
23
-
24
- # -- General configuration ---------------------------------------------------
25
- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
26
-
27
- extensions = [
28
- "sphinx.ext.autodoc",
29
- "sphinx.ext.autosummary",
30
- "nbsphinx",
31
- "sphinx.ext.viewcode",
32
- "sphinx.ext.doctest",
33
- ]
34
-
35
- templates_path = ["_templates"]
36
- exclude_patterns = [
37
- "**.ipynb_checkpoints",
38
- ]
39
- autoclass_content = "both"
40
-
41
-
42
- class MockedClassDocumenter(autodoc.ClassDocumenter):
43
- def add_line(self, line: str, source: str, *lineno: int) -> None:
44
- if line == " Bases: :py:class:`object`":
45
- return
46
- super().add_line(line, source, *lineno)
47
-
48
-
49
- autodoc.ClassDocumenter = MockedClassDocumenter
50
- add_module_names = False
51
-
52
-
53
- def process_signature(app, what, name, obj, options, signature, return_annotation):
54
- # loop through each line in the docstring and replace path with
55
- # the generic path text
56
- signature = re.sub(r"PosixPath\(.*?\)", "FILEPATH", signature)
57
- return (signature, None)
58
-
59
-
60
- def setup(app):
61
- app.connect("autodoc-process-signature", process_signature)
62
-
63
-
64
- # -- Options for HTML output -------------------------------------------------
65
- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
66
-
67
- html_theme = "sphinx_rtd_theme"
68
- html_show_sphinx = False
69
- html_static_path = ["_static"]
70
- html_logo = "_static/gf_logo.png"
71
- html_theme_options = {
72
- "collapse_navigation": False,
73
- "sticky_navigation": True,
74
- "navigation_depth": 3,
75
- "logo_only": True,
76
- }
77
- html_css_files = [
78
- "css/custom.css",
79
- ]
80
- html_show_sourcelink = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/geneformer.classifier.rst DELETED
@@ -1,10 +0,0 @@
1
- geneformer.classifier
2
- =====================
3
-
4
- .. automodule:: geneformer.classifier
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members:
9
- valid_option_dict,
10
- validate_options
 
 
 
 
 
 
 
 
 
 
 
docs/source/geneformer.emb_extractor.rst DELETED
@@ -1,26 +0,0 @@
1
- geneformer.emb\_extractor
2
- =========================
3
-
4
- .. automodule:: geneformer.emb_extractor
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members:
9
- accumulate_tdigests,
10
- gen_heatmap_class_colors,
11
- gen_heatmap_class_dict,
12
- get_embs,
13
- label_cell_embs,
14
- label_gene_embs,
15
- make_colorbar,
16
- plot_heatmap,
17
- plot_umap,
18
- summarize_gene_embs,
19
- tdigest_mean,
20
- tdigest_median,
21
- test_emb,
22
- update_tdigest_dict,
23
- update_tdigest_dict_mean,
24
- update_tdigest_dict_median,
25
- valid_option_dict,
26
- validate_options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/geneformer.in_silico_perturber.rst DELETED
@@ -1,8 +0,0 @@
1
- geneformer.in\_silico\_perturber
2
- =======================================
3
-
4
- .. automodule:: geneformer.in_silico_perturber
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members: valid_option_dict, validate_options, apply_additional_filters, isp_perturb_all, isp_perturb_set, isp_perturb_all_special, isp_perturb_set_special, update_perturbation_dictionary
 
 
 
 
 
 
 
 
 
docs/source/geneformer.in_silico_perturber_stats.rst DELETED
@@ -1,25 +0,0 @@
1
- geneformer.in\_silico\_perturber\_stats
2
- ==============================================
3
-
4
- .. automodule:: geneformer.in_silico_perturber_stats
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members:
9
- find,
10
- get_fdr,
11
- get_gene_list,
12
- get_impact_component,
13
- invert_dict,
14
- isp_aggregate_gene_shifts,
15
- isp_aggregate_grouped_perturb,
16
- isp_stats_mixture_model,
17
- isp_stats_to_goal_state,
18
- isp_stats_vs_null,
19
- n_detections,
20
- read_dict,
21
- read_dictionaries,
22
- token_to_gene_name,
23
- token_tuple_to_ensembl_ids,
24
- valid_option_dict,
25
- validate_options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/geneformer.mtl_classifier.rst DELETED
@@ -1,11 +0,0 @@
1
- geneformer.mtl\_classifier
2
- ==========================
3
-
4
- .. automodule:: geneformer.mtl_classifier
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members:
9
- valid_option_dict,
10
- validate_options,
11
- validate_additional_options
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/geneformer.tokenizer.rst DELETED
@@ -1,15 +0,0 @@
1
- geneformer.tokenizer
2
- ====================
3
-
4
- .. automodule:: geneformer.tokenizer
5
- :members:
6
- :undoc-members:
7
- :show-inheritance:
8
- :exclude-members:
9
- create_dataset,
10
- tokenize_anndata,
11
- tokenize_files,
12
- tokenize_loom,
13
- rank_genes,
14
- tokenize_cell,
15
- sum_ensembl_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/getstarted.rst DELETED
@@ -1,36 +0,0 @@
1
- Getting Started
2
- ===============
3
-
4
- Installation
5
- ------------
6
-
7
- Geneformer installation instructions.
8
-
9
- Make sure you have git-lfs installed (https://git-lfs.com).
10
-
11
- .. code-block:: bash
12
-
13
- git lfs install
14
- git clone https://huggingface.co/ctheodoris/Geneformer
15
- cd Geneformer
16
- pip install .
17
-
18
-
19
- Tutorials
20
- ---------
21
-
22
- | See `examples <https://huggingface.co/ctheodoris/Geneformer/tree/main/examples>`_ for:
23
- | - tokenizing transcriptomes
24
- | - pretraining
25
- | - hyperparameter tuning
26
- | - fine-tuning
27
- | - extracting and plotting cell embeddings
28
- | - in silico perturbation
29
-
30
- Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary depending on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the `example_input_files directory <https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files>`_ in the dataset repository, but these only represent a few example fine-tuning applications.
31
-
32
-
33
- Tips
34
- ----
35
-
36
- Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/source/index.rst DELETED
@@ -1,16 +0,0 @@
1
- Geneformer
2
- ==========
3
-
4
- Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in network biology.
5
-
6
- See `our manuscript <https://rdcu.be/ddrx0>`_ for details.
7
-
8
- Table of Contents
9
- -----------------
10
-
11
- .. toctree::
12
- :maxdepth: 2
13
-
14
- about
15
- getstarted
16
- api
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/cell_classification.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
examples/distributed_multitask_cell_classification.ipynb DELETED
@@ -1,149 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "b3266a7b",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "import os\n",
11
- "import torch\n",
12
- "from geneformer import MTLClassifier"
13
- ]
14
- },
15
- {
16
- "cell_type": "code",
17
- "execution_count": null,
18
- "id": "3e12ac9f",
19
- "metadata": {},
20
- "outputs": [],
21
- "source": [
22
- "# Define paths\n",
23
- "pretrained_path = \"/path/to/pretrained/Geneformer/model\" \n",
24
- "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
25
- "train_path = \"/path/to/train/data.dataset\"\n",
26
- "val_path = \"/path/to/val/data.dataset\"\n",
27
- "test_path = \"/path/to/test/data.dataset\"\n",
28
- "results_dir = \"/path/to/results/directory\"\n",
29
- "model_save_path = \"/path/to/model/save/path\"\n",
30
- "tensorboard_log_dir = \"/path/to/tensorboard/log/dir\"\n",
31
- "\n",
32
- "# Define tasks and hyperparameters\n",
33
- "# task_columns should be a list of column names from your dataset\n",
34
- "# Each column represents a specific classification task (e.g. cell type, disease state)\n",
35
- "task_columns = [\"cell_type\", \"disease_state\"] # Example task columns"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": null,
41
- "id": "c9bd7562",
42
- "metadata": {},
43
- "outputs": [],
44
- "source": [
45
- "# Check GPU environment\n",
46
- "num_gpus = torch.cuda.device_count()\n",
47
- "use_distributed = num_gpus > 1\n",
48
- "print(f\"Number of GPUs detected: {num_gpus}\")\n",
49
- "print(f\"Using distributed training: {use_distributed}\")\n",
50
- "\n",
51
- "# Set environment variables for distributed training when multiple GPUs are available\n",
52
- "if use_distributed:\n",
53
- " os.environ[\"MASTER_ADDR\"] = \"localhost\" # hostname\n",
54
- " os.environ[\"MASTER_PORT\"] = \"12355\" # Choose an available port\n",
55
- " print(\"Distributed environment variables set.\")"
56
- ]
57
- },
58
- {
59
- "cell_type": "code",
60
- "execution_count": null,
61
- "id": "b6ff3618",
62
- "metadata": {},
63
- "outputs": [],
64
- "source": [
65
- "#Define Hyperparameters for Optimization\n",
66
- "hyperparameters = {\n",
67
- " \"learning_rate\": {\"type\": \"float\", \"low\": 1e-5, \"high\": 1e-3, \"log\": True},\n",
68
- " \"warmup_ratio\": {\"type\": \"float\", \"low\": 0.005, \"high\": 0.01},\n",
69
- " \"weight_decay\": {\"type\": \"float\", \"low\": 0.01, \"high\": 0.1},\n",
70
- " \"dropout_rate\": {\"type\": \"float\", \"low\": 0.0, \"high\": 0.7},\n",
71
- " \"lr_scheduler_type\": {\"type\": \"categorical\", \"choices\": [\"cosine\"]},\n",
72
- " \"task_weights\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.0},\n",
73
- "}"
74
- ]
75
- },
76
- {
77
- "cell_type": "code",
78
- "execution_count": null,
79
- "id": "f665c5a7",
80
- "metadata": {},
81
- "outputs": [],
82
- "source": [
83
- "mc = MTLClassifier(\n",
84
- " task_columns=task_columns, # Our defined classification tasks\n",
85
- " study_name=\"MTLClassifier_distributed\",\n",
86
- " pretrained_path=pretrained_path,\n",
87
- " train_path=train_path,\n",
88
- " val_path=val_path,\n",
89
- " test_path=test_path,\n",
90
- " model_save_path=model_save_path,\n",
91
- " results_dir=results_dir,\n",
92
- " tensorboard_log_dir=tensorboard_log_dir,\n",
93
- " hyperparameters=hyperparameters,\n",
94
- " # Distributed training parameters\n",
95
- " distributed_training=use_distributed, # Enable distributed training if multiple GPUs available\n",
96
- " master_addr=\"localhost\" if use_distributed else None,\n",
97
- " master_port=\"12355\" if use_distributed else None,\n",
98
- " # Other training parameters\n",
99
- " n_trials=15, # Number of trials for hyperparameter optimization\n",
100
- " epochs=1, # Number of training epochs (1 suggested to prevent overfitting)\n",
101
- " batch_size=8, # Adjust based on available GPU memory\n",
102
- " gradient_accumulation_steps=4, # Accumulate gradients over multiple steps\n",
103
- " gradient_clipping=True, # Enable gradient clipping for stability\n",
104
- " max_grad_norm=1.0, # Set maximum gradient norm\n",
105
- " seed=42\n",
106
- ")"
107
- ]
108
- },
109
- {
110
- "cell_type": "code",
111
- "execution_count": null,
112
- "id": "f69f7b6a",
113
- "metadata": {},
114
- "outputs": [],
115
- "source": [
116
- "# Run Hyperparameter Optimization with Distributed Training\n",
117
- "if __name__ == \"__main__\":\n",
118
- " # This guard is required for distributed training to prevent\n",
119
- " # infinite subprocess spawning when using torch.multiprocessing\n",
120
- " mc.run_optuna_study()"
121
- ]
122
- },
123
- {
124
- "cell_type": "code",
125
- "execution_count": null,
126
- "id": "3affd5dd",
127
- "metadata": {},
128
- "outputs": [],
129
- "source": [
130
- "# Evaluate the Model on Test Data\n",
131
- "if __name__ == \"__main__\":\n",
132
- " mc.load_and_evaluate_test_model()"
133
- ]
134
- }
135
- ],
136
- "metadata": {
137
- "kernelspec": {
138
- "display_name": "bio",
139
- "language": "python",
140
- "name": "python3"
141
- },
142
- "language_info": {
143
- "name": "python",
144
- "version": "3.12.8"
145
- }
146
- },
147
- "nbformat": 4,
148
- "nbformat_minor": 5
149
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/example_input_files/bivalent_promoters/bivalent_gene_labels.txt ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ENSG00000005073
2
+ ENSG00000007372
3
+ ENSG00000007372
4
+ ENSG00000043355
5
+ ENSG00000068120
6
+ ENSG00000075891
7
+ ENSG00000078399
8
+ ENSG00000105991
9
+ ENSG00000105996
10
+ ENSG00000105997
11
+ ENSG00000106004
12
+ ENSG00000106006
13
+ ENSG00000106031
14
+ ENSG00000106038
15
+ ENSG00000107807
16
+ ENSG00000107821
17
+ ENSG00000107831
18
+ ENSG00000107859
19
+ ENSG00000107862
20
+ ENSG00000108511
21
+ ENSG00000108786
22
+ ENSG00000108797
23
+ ENSG00000110693
24
+ ENSG00000110693
25
+ ENSG00000113430
26
+ ENSG00000115844
27
+ ENSG00000117707
28
+ ENSG00000117707
29
+ ENSG00000119915
30
+ ENSG00000120068
31
+ ENSG00000120075
32
+ ENSG00000120093
33
+ ENSG00000120093
34
+ ENSG00000120094
35
+ ENSG00000122592
36
+ ENSG00000125285
37
+ ENSG00000125798
38
+ ENSG00000125813
39
+ ENSG00000125813
40
+ ENSG00000125816
41
+ ENSG00000125820
42
+ ENSG00000128573
43
+ ENSG00000128645
44
+ ENSG00000128652
45
+ ENSG00000128709
46
+ ENSG00000128710
47
+ ENSG00000128713
48
+ ENSG00000128714
49
+ ENSG00000129514
50
+ ENSG00000131196
51
+ ENSG00000131196
52
+ ENSG00000136327
53
+ ENSG00000136944
54
+ ENSG00000138083
55
+ ENSG00000139800
56
+ ENSG00000143013
57
+ ENSG00000143632
58
+ ENSG00000144355
59
+ ENSG00000148680
60
+ ENSG00000148826
61
+ ENSG00000151615
62
+ ENSG00000152192
63
+ ENSG00000152977
64
+ ENSG00000159184
65
+ ENSG00000159387
66
+ ENSG00000163412
67
+ ENSG00000163421
68
+ ENSG00000163623
69
+ ENSG00000164330
70
+ ENSG00000164438
71
+ ENSG00000164690
72
+ ENSG00000164778
73
+ ENSG00000165588
74
+ ENSG00000165588
75
+ ENSG00000165588
76
+ ENSG00000166407
77
+ ENSG00000166407
78
+ ENSG00000168505
79
+ ENSG00000168875
80
+ ENSG00000169946
81
+ ENSG00000170166
82
+ ENSG00000170178
83
+ ENSG00000170549
84
+ ENSG00000170561
85
+ ENSG00000170577
86
+ ENSG00000170689
87
+ ENSG00000173917
88
+ ENSG00000174279
89
+ ENSG00000174963
90
+ ENSG00000174963
91
+ ENSG00000175879
92
+ ENSG00000176842
93
+ ENSG00000177508
94
+ ENSG00000178573
95
+ ENSG00000182568
96
+ ENSG00000182742
97
+ ENSG00000185551
98
+ ENSG00000185551
99
+ ENSG00000187140
100
+ ENSG00000196092
101
+ ENSG00000197576
102
+ ENSG00000198807
103
+ ENSG00000253293
104
+ ENSG00000256463
105
+ ENSG00000260027
106
+ ENSG00000276644
107
+ ENSG00000285708
examples/example_input_files/bivalent_promoters/lys4_only_gene_labels.txt ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ENSG00000012048
2
+ ENSG00000033627
3
+ ENSG00000037042
4
+ ENSG00000055950
5
+ ENSG00000067596
6
+ ENSG00000069248
7
+ ENSG00000072682
8
+ ENSG00000085274
9
+ ENSG00000088035
10
+ ENSG00000088930
11
+ ENSG00000095539
12
+ ENSG00000102471
13
+ ENSG00000102967
14
+ ENSG00000104313
15
+ ENSG00000105146
16
+ ENSG00000105379
17
+ ENSG00000105982
18
+ ENSG00000105983
19
+ ENSG00000107816
20
+ ENSG00000107819
21
+ ENSG00000107829
22
+ ENSG00000107833
23
+ ENSG00000108784
24
+ ENSG00000108799
25
+ ENSG00000108828
26
+ ENSG00000108830
27
+ ENSG00000109911
28
+ ENSG00000113522
29
+ ENSG00000119487
30
+ ENSG00000120049
31
+ ENSG00000125347
32
+ ENSG00000126581
33
+ ENSG00000131374
34
+ ENSG00000131437
35
+ ENSG00000131462
36
+ ENSG00000131467
37
+ ENSG00000131469
38
+ ENSG00000131470
39
+ ENSG00000131475
40
+ ENSG00000131477
41
+ ENSG00000135272
42
+ ENSG00000135776
43
+ ENSG00000135801
44
+ ENSG00000136158
45
+ ENSG00000140262
46
+ ENSG00000140450
47
+ ENSG00000140563
48
+ ENSG00000140829
49
+ ENSG00000140830
50
+ ENSG00000145494
51
+ ENSG00000146909
52
+ ENSG00000147905
53
+ ENSG00000148688
54
+ ENSG00000148840
55
+ ENSG00000148950
56
+ ENSG00000151332
57
+ ENSG00000151338
58
+ ENSG00000165637
59
+ ENSG00000165644
60
+ ENSG00000166135
61
+ ENSG00000166136
62
+ ENSG00000166167
63
+ ENSG00000166169
64
+ ENSG00000166189
65
+ ENSG00000166197
66
+ ENSG00000166377
67
+ ENSG00000167081
68
+ ENSG00000168118
69
+ ENSG00000171421
70
+ ENSG00000175832
71
+ ENSG00000186480
72
+ ENSG00000187098
73
+ ENSG00000188554
74
+ ENSG00000196628
75
+ ENSG00000196628
76
+ ENSG00000198728
77
+ ENSG00000198728
78
+ ENSG00000198863
79
+ ENSG00000285283
80
+ ENSG00000285708
examples/example_input_files/bivalent_promoters/no_methylation_gene_labels.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ENSG00000068079
2
+ ENSG00000068383
3
+ ENSG00000075290
4
+ ENSG00000104313
5
+ ENSG00000105370
6
+ ENSG00000105374
7
+ ENSG00000105383
8
+ ENSG00000106536
9
+ ENSG00000113520
10
+ ENSG00000113525
11
+ ENSG00000118557
12
+ ENSG00000125257
13
+ ENSG00000128573
14
+ ENSG00000131471
15
+ ENSG00000131480
16
+ ENSG00000131482
17
+ ENSG00000134532
18
+ ENSG00000136319
19
+ ENSG00000138792
20
+ ENSG00000140262
21
+ ENSG00000140718
22
+ ENSG00000147488
23
+ ENSG00000147488
24
+ ENSG00000148677
25
+ ENSG00000151322
26
+ ENSG00000151322
27
+ ENSG00000156113
28
+ ENSG00000164399
29
+ ENSG00000164400
30
+ ENSG00000167749
31
+ ENSG00000167754
32
+ ENSG00000167755
33
+ ENSG00000169035
34
+ ENSG00000170927
35
+ ENSG00000182177
36
+ ENSG00000186153
37
+ ENSG00000187098
38
+ ENSG00000204764
39
+ ENSG00000213022
40
+ ENSG00000213822
41
+ ENSG00000261701
42
+ ENSG00000285708
examples/example_input_files/dosage_sensitive_tfs/dosage_sens_tf_labels.csv ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dosage_sensitive,dosage_insensitive
2
+ ENSG00000008197,ENSG00000010539
3
+ ENSG00000008441,ENSG00000011590
4
+ ENSG00000010818,ENSG00000063438
5
+ ENSG00000011332,ENSG00000063587
6
+ ENSG00000030419,ENSG00000064218
7
+ ENSG00000062194,ENSG00000064489
8
+ ENSG00000065970,ENSG00000067646
9
+ ENSG00000067082,ENSG00000075407
10
+ ENSG00000069667,ENSG00000079263
11
+ ENSG00000072736,ENSG00000081386
12
+ ENSG00000073282,ENSG00000083812
13
+ ENSG00000073861,ENSG00000083814
14
+ ENSG00000077092,ENSG00000083828
15
+ ENSG00000083307,ENSG00000083838
16
+ ENSG00000084676,ENSG00000083844
17
+ ENSG00000085276,ENSG00000085644
18
+ ENSG00000087510,ENSG00000089335
19
+ ENSG00000087903,ENSG00000089775
20
+ ENSG00000089225,ENSG00000102901
21
+ ENSG00000091656,ENSG00000103199
22
+ ENSG00000091831,ENSG00000105136
23
+ ENSG00000095951,ENSG00000105610
24
+ ENSG00000100644,ENSG00000105672
25
+ ENSG00000100968,ENSG00000106410
26
+ ENSG00000101076,ENSG00000106948
27
+ ENSG00000101412,ENSG00000109705
28
+ ENSG00000102870,ENSG00000115568
29
+ ENSG00000102935,ENSG00000117010
30
+ ENSG00000103449,ENSG00000118620
31
+ ENSG00000105698,ENSG00000119574
32
+ ENSG00000105866,ENSG00000120669
33
+ ENSG00000106689,ENSG00000121406
34
+ ENSG00000106852,ENSG00000121864
35
+ ENSG00000111249,ENSG00000122085
36
+ ENSG00000111783,ENSG00000124203
37
+ ENSG00000112033,ENSG00000124232
38
+ ENSG00000112246,ENSG00000124444
39
+ ENSG00000112561,ENSG00000124613
40
+ ENSG00000112837,ENSG00000125520
41
+ ENSG00000115112,ENSG00000127081
42
+ ENSG00000116809,ENSG00000127903
43
+ ENSG00000116833,ENSG00000127989
44
+ ENSG00000117000,ENSG00000129028
45
+ ENSG00000118263,ENSG00000129071
46
+ ENSG00000118922,ENSG00000129194
47
+ ENSG00000119547,ENSG00000130544
48
+ ENSG00000120798,ENSG00000130818
49
+ ENSG00000121068,ENSG00000131848
50
+ ENSG00000123358,ENSG00000132010
51
+ ENSG00000123411,ENSG00000132846
52
+ ENSG00000124496,ENSG00000133250
53
+ ENSG00000124813,ENSG00000134874
54
+ ENSG00000125398,ENSG00000135899
55
+ ENSG00000125618,ENSG00000136866
56
+ ENSG00000126368,ENSG00000137185
57
+ ENSG00000127152,ENSG00000137504
58
+ ENSG00000128573,ENSG00000138380
59
+ ENSG00000129173,ENSG00000140993
60
+ ENSG00000131759,ENSG00000141946
61
+ ENSG00000132005,ENSG00000142556
62
+ ENSG00000133794,ENSG00000143067
63
+ ENSG00000134046,ENSG00000144026
64
+ ENSG00000134317,ENSG00000144161
65
+ ENSG00000134323,ENSG00000145908
66
+ ENSG00000134852,ENSG00000146587
67
+ ENSG00000135111,ENSG00000147183
68
+ ENSG00000137203,ENSG00000147789
69
+ ENSG00000137270,ENSG00000148300
70
+ ENSG00000138795,ENSG00000149054
71
+ ENSG00000139083,ENSG00000149922
72
+ ENSG00000139793,ENSG00000151500
73
+ ENSG00000140548,ENSG00000151650
74
+ ENSG00000140968,ENSG00000151657
75
+ ENSG00000142611,ENSG00000152439
76
+ ENSG00000143033,ENSG00000152467
77
+ ENSG00000143171,ENSG00000152475
78
+ ENSG00000143190,ENSG00000153975
79
+ ENSG00000143355,ENSG00000155592
80
+ ENSG00000143365,ENSG00000156469
81
+ ENSG00000143373,ENSG00000157429
82
+ ENSG00000143437,ENSG00000159882
83
+ ENSG00000144355,ENSG00000159885
84
+ ENSG00000147862,ENSG00000159915
85
+ ENSG00000148516,ENSG00000160224
86
+ ENSG00000150907,ENSG00000160229
87
+ ENSG00000151090,ENSG00000160352
88
+ ENSG00000153234,ENSG00000160908
89
+ ENSG00000158055,ENSG00000160961
90
+ ENSG00000160007,ENSG00000161277
91
+ ENSG00000160094,ENSG00000162086
92
+ ENSG00000161405,ENSG00000163516
93
+ ENSG00000162761,ENSG00000164011
94
+ ENSG00000162924,ENSG00000164048
95
+ ENSG00000164683,ENSG00000164296
96
+ ENSG00000164684,ENSG00000164299
97
+ ENSG00000167182,ENSG00000165066
98
+ ENSG00000168610,ENSG00000165512
99
+ ENSG00000168916,ENSG00000165643
100
+ ENSG00000169554,ENSG00000165684
101
+ ENSG00000169946,ENSG00000166529
102
+ ENSG00000170370,ENSG00000166823
103
+ ENSG00000172733,ENSG00000166860
104
+ ENSG00000172819,ENSG00000167034
105
+ ENSG00000177463,ENSG00000167384
106
+ ENSG00000178177,ENSG00000167554
107
+ ENSG00000179348,ENSG00000167625
108
+ ENSG00000179361,ENSG00000167785
109
+ ENSG00000179456,ENSG00000167800
110
+ ENSG00000180357,ENSG00000167840
111
+ ENSG00000185551,ENSG00000167962
112
+ ENSG00000185591,ENSG00000167981
113
+ ENSG00000187098,ENSG00000168152
114
+ ENSG00000187605,ENSG00000168286
115
+ ENSG00000189308,ENSG00000168769
116
+ ENSG00000196092,ENSG00000169131
117
+ ENSG00000196482,ENSG00000169136
118
+ ENSG00000196628,ENSG00000169548
119
+ ENSG00000197757,ENSG00000169951
120
+ ENSG00000198815,ENSG00000169955
121
+ ENSG00000198945,ENSG00000169989
122
+ ENSG00000198963,ENSG00000170260
123
+ ENSG00000204231,ENSG00000170608
124
+ ,ENSG00000170954
125
+ ,ENSG00000171291
126
+ ,ENSG00000171295
127
+ ,ENSG00000171425
128
+ ,ENSG00000171443
129
+ ,ENSG00000171466
130
+ ,ENSG00000171469
131
+ ,ENSG00000171574
132
+ ,ENSG00000171606
133
+ ,ENSG00000171827
134
+ ,ENSG00000171872
135
+ ,ENSG00000171970
136
+ ,ENSG00000172000
137
+ ,ENSG00000172888
138
+ ,ENSG00000173041
139
+ ,ENSG00000173258
140
+ ,ENSG00000173480
141
+ ,ENSG00000173673
142
+ ,ENSG00000173825
143
+ ,ENSG00000174255
144
+ ,ENSG00000174652
145
+ ,ENSG00000174796
146
+ ,ENSG00000175279
147
+ ,ENSG00000175325
148
+ ,ENSG00000175395
149
+ ,ENSG00000175691
150
+ ,ENSG00000176009
151
+ ,ENSG00000176024
152
+ ,ENSG00000176083
153
+ ,ENSG00000176222
154
+ ,ENSG00000176302
155
+ ,ENSG00000176472
156
+ ,ENSG00000176678
157
+ ,ENSG00000176679
158
+ ,ENSG00000177030
159
+ ,ENSG00000177494
160
+ ,ENSG00000177599
161
+ ,ENSG00000177683
162
+ ,ENSG00000177842
163
+ ,ENSG00000177873
164
+ ,ENSG00000177932
165
+ ,ENSG00000177946
166
+ ,ENSG00000178150
167
+ ,ENSG00000178229
168
+ ,ENSG00000178338
169
+ ,ENSG00000178386
170
+ ,ENSG00000178665
171
+ ,ENSG00000178917
172
+ ,ENSG00000178928
173
+ ,ENSG00000178935
174
+ ,ENSG00000179195
175
+ ,ENSG00000179772
176
+ ,ENSG00000179774
177
+ ,ENSG00000179886
178
+ ,ENSG00000179909
179
+ ,ENSG00000179922
180
+ ,ENSG00000179930
181
+ ,ENSG00000179943
182
+ ,ENSG00000179965
183
+ ,ENSG00000180257
184
+ ,ENSG00000180346
185
+ ,ENSG00000180532
186
+ ,ENSG00000180535
187
+ ,ENSG00000180938
188
+ ,ENSG00000181135
189
+ ,ENSG00000181444
190
+ ,ENSG00000181450
191
+ ,ENSG00000181638
192
+ ,ENSG00000181894
193
+ ,ENSG00000181896
194
+ ,ENSG00000182318
195
+ ,ENSG00000182983
196
+ ,ENSG00000182986
197
+ ,ENSG00000183340
198
+ ,ENSG00000183647
199
+ ,ENSG00000183734
200
+ ,ENSG00000183850
201
+ ,ENSG00000184221
202
+ ,ENSG00000184517
203
+ ,ENSG00000184635
204
+ ,ENSG00000184677
205
+ ,ENSG00000184895
206
+ ,ENSG00000185155
207
+ ,ENSG00000185252
208
+ ,ENSG00000185404
209
+ ,ENSG00000185730
210
+ ,ENSG00000186020
211
+ ,ENSG00000186026
212
+ ,ENSG00000186051
213
+ ,ENSG00000186103
214
+ ,ENSG00000186230
215
+ ,ENSG00000186300
216
+ ,ENSG00000186376
217
+ ,ENSG00000186446
218
+ ,ENSG00000186496
219
+ ,ENSG00000186777
220
+ ,ENSG00000186812
221
+ ,ENSG00000186814
222
+ ,ENSG00000187626
223
+ ,ENSG00000187801
224
+ ,ENSG00000187821
225
+ ,ENSG00000187855
226
+ ,ENSG00000187987
227
+ ,ENSG00000188033
228
+ ,ENSG00000188095
229
+ ,ENSG00000188171
230
+ ,ENSG00000188295
231
+ ,ENSG00000188321
232
+ ,ENSG00000188629
233
+ ,ENSG00000188785
234
+ ,ENSG00000188868
235
+ ,ENSG00000189164
236
+ ,ENSG00000189190
237
+ ,ENSG00000189298
238
+ ,ENSG00000189299
239
+ ,ENSG00000196152
240
+ ,ENSG00000196172
241
+ ,ENSG00000196214
242
+ ,ENSG00000196345
243
+ ,ENSG00000196357
244
+ ,ENSG00000196378
245
+ ,ENSG00000196381
246
+ ,ENSG00000196387
247
+ ,ENSG00000196391
248
+ ,ENSG00000196417
249
+ ,ENSG00000196418
250
+ ,ENSG00000196456
251
+ ,ENSG00000196460
252
+ ,ENSG00000196466
253
+ ,ENSG00000196605
254
+ ,ENSG00000196646
255
+ ,ENSG00000196652
256
+ ,ENSG00000196670
257
+ ,ENSG00000196693
258
+ ,ENSG00000196705
259
+ ,ENSG00000196812
260
+ ,ENSG00000196946
261
+ ,ENSG00000197008
262
+ ,ENSG00000197020
263
+ ,ENSG00000197037
264
+ ,ENSG00000197044
265
+ ,ENSG00000197054
266
+ ,ENSG00000197124
267
+ ,ENSG00000197134
268
+ ,ENSG00000197162
269
+ ,ENSG00000197213
270
+ ,ENSG00000197279
271
+ ,ENSG00000197343
272
+ ,ENSG00000197360
273
+ ,ENSG00000197363
274
+ ,ENSG00000197472
275
+ ,ENSG00000197779
276
+ ,ENSG00000197841
277
+ ,ENSG00000197857
278
+ ,ENSG00000197863
279
+ ,ENSG00000197928
280
+ ,ENSG00000197933
281
+ ,ENSG00000197951
282
+ ,ENSG00000198028
283
+ ,ENSG00000198039
284
+ ,ENSG00000198046
285
+ ,ENSG00000198185
286
+ ,ENSG00000198205
287
+ ,ENSG00000198300
288
+ ,ENSG00000198315
289
+ ,ENSG00000198342
290
+ ,ENSG00000198346
291
+ ,ENSG00000198429
292
+ ,ENSG00000198440
293
+ ,ENSG00000198464
294
+ ,ENSG00000198466
295
+ ,ENSG00000198482
296
+ ,ENSG00000198538
297
+ ,ENSG00000198546
298
+ ,ENSG00000198551
299
+ ,ENSG00000198556
300
+ ,ENSG00000198633
301
+ ,ENSG00000198939
302
+ ,ENSG00000203326
303
+ ,ENSG00000204514
304
+ ,ENSG00000204519
305
+ ,ENSG00000204532
306
+ ,ENSG00000204595
307
+ ,ENSG00000204604
308
+ ,ENSG00000204644
309
+ ,ENSG00000204946
310
+ ,ENSG00000213020
311
+ ,ENSG00000213799
312
+ ,ENSG00000213973
313
+ ,ENSG00000213988
314
+ ,ENSG00000214189
315
+ ,ENSG00000215271
316
+ ,ENSG00000215372
317
+ ,ENSG00000215612
318
+ ,ENSG00000220201
319
+ ,ENSG00000221923
320
+ ,ENSG00000223547
321
+ ,ENSG00000227124
322
+ ,ENSG00000229676
323
+ ,ENSG00000229809
324
+ ,ENSG00000230797
325
+ ,ENSG00000232040
326
+ ,ENSG00000234284
327
+ ,ENSG00000234444
328
+ ,ENSG00000235109
329
+ ,ENSG00000235608
330
+ ,ENSG00000236104
331
+ ,ENSG00000236609
332
+ ,ENSG00000237440
333
+ ,ENSG00000242852
334
+ ,ENSG00000243660
335
+ ,ENSG00000245680
336
+ ,ENSG00000248483
337
+ ,ENSG00000249459
338
+ ,ENSG00000249471
339
+ ,ENSG00000249709
340
+ ,ENSG00000250571
341
+ ,ENSG00000250709
342
+ ,ENSG00000251192
343
+ ,ENSG00000251247
344
+ ,ENSG00000251369
345
+ ,ENSG00000253831
346
+ ,ENSG00000254004
347
+ ,ENSG00000256087
348
+ ,ENSG00000256223
349
+ ,ENSG00000256229
350
+ ,ENSG00000256294
351
+ ,ENSG00000256463
352
+ ,ENSG00000256683
353
+ ,ENSG00000256771
354
+ ,ENSG00000257446
355
+ ,ENSG00000257591
356
+ ,ENSG00000258405
357
+ ,ENSG00000258873
358
+ ,ENSG00000263002
359
+ ,ENSG00000264668
360
+ ,ENSG00000265763
361
+ ,ENSG00000267041
362
+ ,ENSG00000267179
363
+ ,ENSG00000267281
364
+ ,ENSG00000267508
365
+ ,ENSG00000267680
366
+ ,ENSG00000269067
367
+ ,ENSG00000269343
368
+ ,ENSG00000269699
369
+ ,ENSG00000272602
examples/example_input_files/gene_info_table.csv ADDED
The diff for this file is too large to render. See raw diff
 
examples/extract_and_plot_cell_embeddings.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
examples/gene_classification.ipynb CHANGED
The diff for this file is too large to render. See raw diff