Upload UCE 33-layer model
Browse files- .gitattributes +1 -0
- README.md +63 -0
- config.json +26 -0
- model.pt +3 -0
- new_species_protein_embeddings.csv +3 -0
- protein_embeddings/Danio_rerio.GRCz11.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Macaca_fascicularis.Macaca_fascicularis_6.0.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Macaca_mulatta.Mmul_10.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Microcebus_murinus.Mmur_3.0.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Mus_musculus.GRCm39.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Sus_scrofa.Sscrofa11.1.gene_symbol_to_embedding_ESM2.pt +3 -0
- protein_embeddings/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.gene_symbol_to_embedding_ESM2.pt +3 -0
- species_chrom.csv +0 -0
- species_offsets.pkl +3 -0
- tokens.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
new_species_protein_embeddings.csv filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# UCE 33-Layer Model
|
| 2 |
+
|
| 3 |
+
## Model Information
|
| 4 |
+
|
| 5 |
+
- **Model**: Universal Cell Embeddings (UCE)
|
| 6 |
+
- **Variant**: 33-layer Transformer
|
| 7 |
+
- **Source**: https://github.com/snap-stanford/UCE
|
| 8 |
+
- **Paper**: [Universal Cell Embeddings: A Foundation Model for Cell Biology](https://www.biorxiv.org/content/10.1101/2023.11.28.568918v1)
|
| 9 |
+
|
| 10 |
+
## Architecture
|
| 11 |
+
|
| 12 |
+
- **Layers**: 33
|
| 13 |
+
- **Model Dimension**: 1280
|
| 14 |
+
- **Attention Heads**: 20
|
| 15 |
+
- **Hidden Dimension**: 5120
|
| 16 |
+
- **Output Dimension**: 1280
|
| 17 |
+
- **Token Dimension**: 5120 (ESM2 protein embeddings)
|
| 18 |
+
|
| 19 |
+
## Usage
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
from perturblab.model.uce import UCEModel
|
| 23 |
+
|
| 24 |
+
# Load pretrained model
|
| 25 |
+
model = UCEModel.from_pretrained('./weights/uce-33layer')
|
| 26 |
+
|
| 27 |
+
# Generate embeddings
|
| 28 |
+
result = model.predict_embeddings(
|
| 29 |
+
data=adata, # or PerturbationData
|
| 30 |
+
species='human',
|
| 31 |
+
batch_size=25
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
cell_embeddings = result['cell_embeddings'] # (n_cells, 1280)
|
| 35 |
+
gene_embeddings = result['gene_embeddings'] # (n_cells, seq_len, 1280)
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Files
|
| 39 |
+
|
| 40 |
+
- `model.pt`: Model state dict
|
| 41 |
+
- `tokens.pt`: Token embeddings (5120-dimensional ESM2 15B protein embeddings + chromosome tokens)
|
| 42 |
+
- `config.json`: Model configuration
|
| 43 |
+
- `species_chrom.csv`: Gene to chromosome mapping
|
| 44 |
+
- `species_offsets.pkl`: Species offsets in token file
|
| 45 |
+
- `protein_embeddings/`: Protein embeddings for each species
|
| 46 |
+
- `README.md`: This file
|
| 47 |
+
|
| 48 |
+
## Citation
|
| 49 |
+
|
| 50 |
+
```bibtex
|
| 51 |
+
@article{rosen2023universal,
|
| 52 |
+
title={Universal Cell Embeddings: A Foundation Model for Cell Biology},
|
| 53 |
+
author={Rosen, Yanay and Roohani, Yusuf and Agrawal, Ayush and Samotorcan, Leon and {Tabula Sapiens Consortium} and Quake, Stephen R and Leskovec, Jure},
|
| 54 |
+
journal={bioRxiv},
|
| 55 |
+
pages={2023--11},
|
| 56 |
+
year={2023},
|
| 57 |
+
publisher={Cold Spring Harbor Laboratory}
|
| 58 |
+
}
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## License
|
| 62 |
+
|
| 63 |
+
MIT License (see original repository for details)
|
config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_series": "uce",
|
| 3 |
+
"model_name": "33layer",
|
| 4 |
+
"model_type": "embedding_extractor",
|
| 5 |
+
"nlayers": 33,
|
| 6 |
+
"d_model": 1280,
|
| 7 |
+
"nhead": 20,
|
| 8 |
+
"d_hid": 5120,
|
| 9 |
+
"output_dim": 1280,
|
| 10 |
+
"token_dim": 5120,
|
| 11 |
+
"dropout": 0.05,
|
| 12 |
+
"pad_length": 1536,
|
| 13 |
+
"sample_size": 1024,
|
| 14 |
+
"pad_token_idx": 0,
|
| 15 |
+
"chrom_token_left_idx": 1,
|
| 16 |
+
"chrom_token_right_idx": 2,
|
| 17 |
+
"cls_token_idx": 3,
|
| 18 |
+
"CHROM_TOKEN_OFFSET": 143574,
|
| 19 |
+
"species": "human",
|
| 20 |
+
"embedding_model": "ESM2",
|
| 21 |
+
"spec_chrom_csv_path": "species_chrom.csv",
|
| 22 |
+
"token_file": "tokens.pt",
|
| 23 |
+
"protein_embeddings_dir": "protein_embeddings/",
|
| 24 |
+
"offset_pkl_path": "species_offsets.pkl",
|
| 25 |
+
"batch_first": true
|
| 26 |
+
}
|
model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c48b984340a8bbfc23accf8cb5ee0b51458db4202df4616375159f0547737ba3
|
| 3 |
+
size 5686185611
|
new_species_protein_embeddings.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07397ab3828502fb7d0bab658125c47f145f989f384db8f55fe106f6826b2a54
|
| 3 |
+
size 2979205876
|
protein_embeddings/Danio_rerio.GRCz11.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c09da177ab2182ba2d4f2319c3e6b7d17325244f8af8ec32f795e4f0e929053f
|
| 3 |
+
size 537414921
|
protein_embeddings/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a210e1cc7901513999b2bca3836ba9e2f203cd008be4e9a9d6412a2267de9748
|
| 3 |
+
size 410886729
|
protein_embeddings/Macaca_fascicularis.Macaca_fascicularis_6.0.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03e3ccf415ae2e4afe141111e7cb099b19c9e6f392afa422ca70eb0e1ce34737
|
| 3 |
+
size 315060873
|
protein_embeddings/Macaca_mulatta.Mmul_10.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93e00c3f15bf188d8ea4ee9ac38218d499c387aae004b56ec918f2081fa4ef47
|
| 3 |
+
size 348282889
|
protein_embeddings/Microcebus_murinus.Mmur_3.0.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3dd363d56051371737f66b7004dbe6cc7da31fc611bd79c4710639206bf929c3
|
| 3 |
+
size 301917449
|
protein_embeddings/Mus_musculus.GRCm39.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38d15388a4261ab0926d126951c7b78519eb526b63b3b5154e25251b0f7f6d35
|
| 3 |
+
size 463508361
|
protein_embeddings/Sus_scrofa.Sscrofa11.1.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce2a23ae9161b5b638b0c44b4a7ae41f6e38b95b56fc563bd380288acd8b358e
|
| 3 |
+
size 327456521
|
protein_embeddings/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.gene_symbol_to_embedding_ESM2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98dd20fb50f99094f9eff4ff661c9cfd8f5d6f96121c256f51d4dd3b0171fca7
|
| 3 |
+
size 280532425
|
species_chrom.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
species_offsets.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abda5b2bc4018187e408623b292686a061912f449daceb4c9c9603caf0d62538
|
| 3 |
+
size 139
|
tokens.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50f00d8ed76e39aa5d57692a93ce6c4298134913a657a2ac2f3a0993cb0b43fc
|
| 3 |
+
size 2979206626
|