Upload 5 files
Browse files
- README.md +66 -3
- config.json +22 -0
- gitattributes +38 -0
- model.safetensors +3 -0
- scgpt_gh_repo_original_model.bin +3 -0
README.md
CHANGED
|
@@ -1,3 +1,66 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- single-cell
|
| 5 |
+
- biology
|
| 6 |
+
base_model:
|
| 7 |
+
- MohamedMabrouk/scGPT
|
| 8 |
+
---
|
| 9 |
+
# scGPT
|
| 10 |
+
scGPT is a foundation model for single-cell biology, based on a generative pretrained transformer across a repository of over 33 million cells.
|
| 11 |
+
|
| 12 |
+
# Abstract
|
| 13 |
+
Generative pretrained models have achieved remarkable success in various domains such as language and computer vision. Specifically, the combination of large-scale diverse datasets and pretrained transformers has emerged as a promising approach for developing foundation models. Drawing parallels between language and cellular biology (in which texts comprise words; similarly, cells are defined by genes), our study probes the applicability of foundation models to advance cellular biology and genetic research. Using burgeoning single-cell sequencing data, we have constructed a foundation model for single-cell biology, scGPT, based on a generative pretrained transformer across a repository of over 33 million cells. Our findings illustrate that scGPT effectively distills critical biological insights concerning genes and cells. Through further adaptation of transfer learning, scGPT can be optimized to achieve superior performance across diverse downstream applications. This includes tasks such as cell type annotation, multi-batch integration, multi-omic integration, perturbation response prediction and gene network inference.
|
| 14 |
+
|
| 15 |
+
# Code
|
| 16 |
+
|
| 17 |
+
```python
|
| 18 |
+
from tdc.multi_pred.anndata_dataset import DataLoader
|
| 19 |
+
from tdc import tdc_hf_interface
|
| 20 |
+
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
|
| 21 |
+
import torch
|
| 22 |
+
|
| 23 |
+
# an example dataset
|
| 24 |
+
adata = DataLoader("cellxgene_sample_small",
|
| 25 |
+
"./data",
|
| 26 |
+
dataset_names=["cellxgene_sample_small"],
|
| 27 |
+
no_convert=True).adata
|
| 28 |
+
|
| 29 |
+
# code for loading the model and performing inference
|
| 30 |
+
scgpt = tdc_hf_interface("scGPT")
|
| 31 |
+
model = scgpt.load() # This line can cause a segmentation fault if the environment is not set up correctly
|
| 32 |
+
tokenizer = scGPTTokenizer()
|
| 33 |
+
gene_ids = adata.var["feature_name"].to_numpy(
|
| 34 |
+
) # Convert to numpy array
|
| 35 |
+
tokenized_data = tokenizer.tokenize_cell_vectors(
|
| 36 |
+
adata.X.toarray(), gene_ids)
|
| 37 |
+
mask = torch.tensor([x != 0 for x in tokenized_data[0][1]],
|
| 38 |
+
dtype=torch.bool)
|
| 39 |
+
|
| 40 |
+
# Extract first embedding
|
| 41 |
+
first_embed = model(tokenized_data[0][0],
|
| 42 |
+
tokenized_data[0][1],
|
| 43 |
+
attention_mask=mask)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
# TDC.scGPT Source Code
|
| 47 |
+
https://github.com/mims-harvard/TDC/blob/main/tdc/model_server/models/scgpt.py
|
| 48 |
+
* Hugging Face (HF) migration code is available upon request
|
| 49 |
+
* weights extracted from base model
|
| 50 |
+
|
| 51 |
+
# TDC Citation
|
| 52 |
+
```
|
| 53 |
+
@inproceedings{
|
| 54 |
+
velez-arce2024signals,
|
| 55 |
+
title={Signals in the Cells: Multimodal and Contextualized Machine Learning Foundations for Therapeutics},
|
| 56 |
+
author={Alejandro Velez-Arce and Xiang Lin and Kexin Huang and Michelle M Li and Wenhao Gao and Bradley Pentelute and Tianfan Fu and Manolis Kellis and Marinka Zitnik},
|
| 57 |
+
booktitle={NeurIPS 2024 Workshop on AI for New Drug Modalities},
|
| 58 |
+
year={2024},
|
| 59 |
+
url={https://openreview.net/forum?id=kL8dlYp6IM}
|
| 60 |
+
}
|
| 61 |
+
```
|
| 62 |
+
# Additional Citations
|
| 63 |
+
- Cui, H., Wang, C., Maan, H. et al. scGPT: toward building a foundation model for single-cell multi-omics using generative AI. Nat Methods 21, 1470–1480 (2024). https://doi.org/10.1038/s41592-024-02201-0
|
| 64 |
+
|
| 65 |
+
# Model Github
|
| 66 |
+
https://github.com/bowang-lab/scGPT
|
config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"ScGPTModel"
|
| 4 |
+
],
|
| 5 |
+
"cell_emb_style": "cls",
|
| 6 |
+
"d_hid": 512,
|
| 7 |
+
"dropout": 0.0,
|
| 8 |
+
"embsize": 512,
|
| 9 |
+
"explicit_zero_prob": false,
|
| 10 |
+
"input_emb_style": "continuous",
|
| 11 |
+
"max_seq_len": 1536,
|
| 12 |
+
"model_type": "scgpt",
|
| 13 |
+
"nhead": 8,
|
| 14 |
+
"nlayers": 12,
|
| 15 |
+
"norm_scheme": "post",
|
| 16 |
+
"pad_token_id": 0,
|
| 17 |
+
"torch_dtype": "float32",
|
| 18 |
+
"transformers_version": "4.43.4",
|
| 19 |
+
"use_fast_transformer": true,
|
| 20 |
+
"use_flash_attention": false,
|
| 21 |
+
"vocab_size": 60697
|
| 22 |
+
}
|
gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cabd40e6e22514865825975940f2c61ac1395f3317bba9f773858cd72914064c
|
| 3 |
+
size 203233980
|
scgpt_gh_repo_original_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad0252a1971e0cd619b7116dbab3177432236c4537225d54280a2aa7e5fe402a
|
| 3 |
+
size 207861754
|