Sync README model card from project root
Browse files
README.md
CHANGED
|
@@ -101,11 +101,11 @@ from transformers import AutoTokenizer, AutoModel
|
|
| 101 |
|
| 102 |
# Load pretrained model and tokenizer
|
| 103 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 104 |
-
"
|
| 105 |
trust_remote_code=True
|
| 106 |
)
|
| 107 |
model = AutoModel.from_pretrained(
|
| 108 |
-
"
|
| 109 |
trust_remote_code=True
|
| 110 |
)
|
| 111 |
|
|
@@ -151,7 +151,7 @@ from torch.utils.data import Dataset
|
|
| 151 |
|
| 152 |
# Load model with classification head
|
| 153 |
model = AutoModelForSequenceClassification.from_pretrained(
|
| 154 |
-
"
|
| 155 |
num_labels=10, # number of cell types
|
| 156 |
trust_remote_code=True
|
| 157 |
)
|
|
@@ -215,7 +215,7 @@ from torch.utils.data import Dataset
|
|
| 215 |
|
| 216 |
# Load model for masked LM
|
| 217 |
model = AutoModelForMaskedLM.from_pretrained(
|
| 218 |
-
"
|
| 219 |
trust_remote_code=True
|
| 220 |
)
|
| 221 |
|
|
@@ -266,7 +266,7 @@ from transformers.utils.hub import register_and_push_to_hub_with_git_history
|
|
| 266 |
|
| 267 |
# Create config
|
| 268 |
config = AutoConfig.from_pretrained(
|
| 269 |
-
"
|
| 270 |
trust_remote_code=True
|
| 271 |
)
|
| 272 |
|
|
@@ -340,17 +340,27 @@ Task-Specific Heads:
|
|
| 340 |
```python
|
| 341 |
# Standard loading (backbone only)
|
| 342 |
from transformers import AutoModel
|
| 343 |
-
model = AutoModel.from_pretrained("
|
| 344 |
|
| 345 |
# Classification
|
| 346 |
from transformers import AutoModelForSequenceClassification
|
| 347 |
model = AutoModelForSequenceClassification.from_pretrained(
|
| 348 |
-
"
|
| 349 |
)
|
| 350 |
|
| 351 |
# Masked LM
|
| 352 |
from transformers import AutoModelForMaskedLM
|
| 353 |
-
model = AutoModelForMaskedLM.from_pretrained("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
```
|
| 355 |
|
| 356 |
### Saving Models
|
|
@@ -375,7 +385,7 @@ All hyperparameters are stored in `config.json`:
|
|
| 375 |
"hidden_size": 512,
|
| 376 |
"num_hidden_layers": 24,
|
| 377 |
"vocab_size": 25426,
|
| 378 |
-
|
| 379 |
"embedding_pooling": "mean"
|
| 380 |
}
|
| 381 |
```
|
|
@@ -434,15 +444,16 @@ input_ids = tokenizer(gene_ids, return_tensors="pt", padding=True)["input_ids"]
|
|
| 434 |
|
| 435 |
See the `examples/` directory for complete scripts:
|
| 436 |
|
| 437 |
-
- `
|
| 438 |
-
- `
|
| 439 |
-
- `
|
| 440 |
-
- `
|
|
|
|
| 441 |
|
| 442 |
Run any example:
|
| 443 |
|
| 444 |
```bash
|
| 445 |
-
python examples/
|
| 446 |
```
|
| 447 |
|
| 448 |
---
|
|
@@ -471,6 +482,25 @@ This is expected for custom models. Either:
|
|
| 471 |
1. Set `trust_remote_code=True` (safe if loading from official repo)
|
| 472 |
2. Or use `sys.path.insert(0, '.')` if loading local code
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
### Out of Memory (OOM)
|
| 475 |
|
| 476 |
Reduce batch size:
|
|
|
|
| 101 |
|
| 102 |
# Load pretrained model and tokenizer
|
| 103 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 104 |
+
"mineself2016/GeneMamba",
|
| 105 |
trust_remote_code=True
|
| 106 |
)
|
| 107 |
model = AutoModel.from_pretrained(
|
| 108 |
+
"mineself2016/GeneMamba",
|
| 109 |
trust_remote_code=True
|
| 110 |
)
|
| 111 |
|
|
|
|
| 151 |
|
| 152 |
# Load model with classification head
|
| 153 |
model = AutoModelForSequenceClassification.from_pretrained(
|
| 154 |
+
"mineself2016/GeneMamba",
|
| 155 |
num_labels=10, # number of cell types
|
| 156 |
trust_remote_code=True
|
| 157 |
)
|
|
|
|
| 215 |
|
| 216 |
# Load model for masked LM
|
| 217 |
model = AutoModelForMaskedLM.from_pretrained(
|
| 218 |
+
"mineself2016/GeneMamba",
|
| 219 |
trust_remote_code=True
|
| 220 |
)
|
| 221 |
|
|
|
|
| 266 |
|
| 267 |
# Create config
|
| 268 |
config = AutoConfig.from_pretrained(
|
| 269 |
+
"mineself2016/GeneMamba",
|
| 270 |
trust_remote_code=True
|
| 271 |
)
|
| 272 |
|
|
|
|
| 340 |
```python
|
| 341 |
# Standard loading (backbone only)
|
| 342 |
from transformers import AutoModel
|
| 343 |
+
model = AutoModel.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
|
| 344 |
|
| 345 |
# Classification
|
| 346 |
from transformers import AutoModelForSequenceClassification
|
| 347 |
model = AutoModelForSequenceClassification.from_pretrained(
|
| 348 |
+
"mineself2016/GeneMamba", num_labels=10, trust_remote_code=True
|
| 349 |
)
|
| 350 |
|
| 351 |
# Masked LM
|
| 352 |
from transformers import AutoModelForMaskedLM
|
| 353 |
+
model = AutoModelForMaskedLM.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
Load other model sizes from subfolders:
|
| 357 |
+
|
| 358 |
+
```python
|
| 359 |
+
model_24l_768d = AutoModel.from_pretrained(
|
| 360 |
+
"mineself2016/GeneMamba",
|
| 361 |
+
subfolder="24l-768d",
|
| 362 |
+
trust_remote_code=True,
|
| 363 |
+
)
|
| 364 |
```
|
| 365 |
|
| 366 |
### Saving Models
|
|
|
|
| 385 |
"hidden_size": 512,
|
| 386 |
"num_hidden_layers": 24,
|
| 387 |
"vocab_size": 25426,
|
| 388 |
+
"mamba_mode": "mean",
|
| 389 |
"embedding_pooling": "mean"
|
| 390 |
}
|
| 391 |
```
|
|
|
|
| 444 |
|
| 445 |
See the `examples/` directory for complete scripts:
|
| 446 |
|
| 447 |
+
- `00_preprocess_to_input_ids.py` - Convert h5ad files to ranked gene token IDs
|
| 448 |
+
- `01_extract_embeddings.py` - Extract cell embeddings
|
| 449 |
+
- `10_finetune_classification.py` - Cell type annotation
|
| 450 |
+
- `20_continue_pretraining_reference.py` - Domain adaptation
|
| 451 |
+
- `21_pretrain_from_scratch_reference.py` - Training from scratch
|
| 452 |
|
| 453 |
Run any example:
|
| 454 |
|
| 455 |
```bash
|
| 456 |
+
python examples/01_extract_embeddings.py
|
| 457 |
```
|
| 458 |
|
| 459 |
---
|
|
|
|
| 482 |
1. Set `trust_remote_code=True` (safe if loading from official repo)
|
| 483 |
2. Or use `sys.path.insert(0, '.')` if loading local code
|
| 484 |
|
| 485 |
+
### Old Cached Code / Shape Mismatch
|
| 486 |
+
|
| 487 |
+
If you still see the old loading errors after an update, force a fresh download of the files from the Hub:
|
| 488 |
+
|
| 489 |
+
```python
|
| 490 |
+
from transformers import AutoModel
|
| 491 |
+
model = AutoModel.from_pretrained(
|
| 492 |
+
"mineself2016/GeneMamba",
|
| 493 |
+
trust_remote_code=True,
|
| 494 |
+
force_download=True,
|
| 495 |
+
)
|
| 496 |
+
```
|
| 497 |
+
|
| 498 |
+
You can also clear the local cache if needed:
|
| 499 |
+
|
| 500 |
+
```bash
|
| 501 |
+
rm -rf ~/.cache/huggingface/hub/models--mineself2016--GeneMamba
|
| 502 |
+
```
|
| 503 |
+
|
| 504 |
### Out of Memory (OOM)
|
| 505 |
|
| 506 |
Reduce batch size:
|