mineself2016 committed on
Commit
2a939f5
·
verified ·
1 Parent(s): b8d89c6

Use next-token training and checkpoint resume in train-from-scratch

Browse files
Files changed (1) hide show
  1. README.md +52 -98
README.md CHANGED
@@ -20,8 +20,7 @@ A Hugging Face compatible implementation of GeneMamba, a foundational state-spac
20
  - [Quick Start](#quick-start)
21
  - [Phase 1: Extract Cell Embeddings](#phase-1-extract-cell-embeddings)
22
  - [Phase 2: Downstream Tasks](#phase-2-downstream-tasks)
23
- - [Phase 3: Continue Pretraining](#phase-3-continue-pretraining)
24
- - [Phase 4: Train from Scratch](#phase-4-train-from-scratch)
25
  - [Model Variants](#model-variants)
26
  - [Architecture](#architecture)
27
  - [Datasets](#datasets)
@@ -37,14 +36,14 @@ GeneMamba is a **state-space model (SSM)** based on **Mamba architecture** optim
37
 
38
  - **Takes ranked gene sequences** as input (genes sorted by expression level)
39
  - **Outputs cell embeddings** suitable for clustering, classification, and batch integration
40
- - **Supports multiple downstream tasks** including cell type annotation and masked LM pretraining
41
  - **Is compatible with Hugging Face Transformers** for easy integration into existing pipelines
42
 
43
  ### Key Features
44
 
45
  ✅ **Efficient Sequence Processing**: SSM-based architecture with linear complexity
46
  ✅ **Cell Representation Learning**: Direct cell embedding without intermediate steps
47
- ✅ **Multi-task Support**: Classification, masked LM, and embeddings in one model
48
  ✅ **Hugging Face Integration**: Standard `from_pretrained()` and `save_pretrained()` interface
49
  ✅ **Production Ready**: Pretrained checkpoints available on Hugging Face Hub
50
 
@@ -204,117 +203,74 @@ The model also supports:
204
 
205
  ---
206
 
207
- ### Phase 3: Continue Pretraining
208
 
209
- Fine-tune the model on your own single-cell data using **masked LM objective**:
 
210
 
211
  ```python
212
  import torch
 
213
  from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
214
- from torch.utils.data import Dataset
215
 
216
- # Load model for masked LM
217
  tokenizer = AutoTokenizer.from_pretrained(
218
  "mineself2016/GeneMamba",
219
- trust_remote_code=True
220
- )
221
- model = AutoModelForMaskedLM.from_pretrained(
222
- "mineself2016/GeneMamba",
223
- trust_remote_code=True
224
  )
225
 
226
- print("vocab_size:", tokenizer.vocab_size) # 25426
227
- print("unk_token/id:", tokenizer.unk_token, tokenizer.unk_token_id) # [UNK], 0
228
- print("pad_token/id:", tokenizer.pad_token, tokenizer.pad_token_id) # [PAD], 1
229
- print("cls_token/id:", tokenizer.cls_token, tokenizer.cls_token_id) # None, None
230
- print("mask_token/id:", tokenizer.mask_token, tokenizer.mask_token_id) # None, None
231
 
232
- # Important:
233
- # GeneMamba tokenizer defines only [UNK]=0 and [PAD]=1 as special tokens.
234
- # There is no built-in [CLS]/[SEP]/[MASK].
235
 
236
- # Your pretraining dataset (with input_ids only, no labels needed)
237
- class PretrainDataset(Dataset):
238
- def __init__(self, input_ids_list):
239
- self.input_ids_list = input_ids_list
240
-
241
- def __len__(self):
242
- return len(self.input_ids_list)
243
-
244
- def __getitem__(self, idx):
245
- return {"input_ids": self.input_ids_list[idx]}
246
-
247
- # Custom MLM collator (replace masked positions with [UNK], id=0)
248
- class GeneMambaMLMCollator:
249
- def __init__(self, pad_token_id=1, unk_token_id=0, mlm_probability=0.15):
250
- self.pad_token_id = pad_token_id
251
- self.unk_token_id = unk_token_id
252
- self.mlm_probability = mlm_probability
253
-
254
- def __call__(self, features):
255
- input_ids = torch.stack([f["input_ids"] for f in features])
256
- labels = input_ids.clone()
257
-
258
- prob = torch.full(labels.shape, self.mlm_probability)
259
- mask_positions = torch.bernoulli(prob).bool()
260
- mask_positions &= input_ids.ne(self.pad_token_id)
261
-
262
- labels[~mask_positions] = -100
263
- input_ids[mask_positions] = self.unk_token_id
264
- return {"input_ids": input_ids, "labels": labels}
265
-
266
- data_collator = GeneMambaMLMCollator(
267
- pad_token_id=tokenizer.pad_token_id,
268
- unk_token_id=tokenizer.unk_token_id,
269
- mlm_probability=0.15,
270
  )
271
 
272
- # Train
273
- trainer = Trainer(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  model=model,
275
  args=TrainingArguments(
276
- output_dir="./pretrain_results",
277
  num_train_epochs=3,
278
  per_device_train_batch_size=32,
279
  learning_rate=2e-5,
280
  ),
281
  train_dataset=train_dataset,
282
- data_collator=data_collator,
283
  )
284
 
285
- trainer.train()
286
- ```
287
-
288
- ---
289
-
290
- ### Phase 4: Train from Scratch
291
-
292
- Initialize and train a new GeneMamba model from scratch:
293
-
294
- ```python
295
- import torch
296
- from transformers import AutoConfig, PreTrainedModel
297
- from transformers.utils.hub import register_and_push_to_hub_with_git_history
298
-
299
- # Create config
300
- config = AutoConfig.from_pretrained(
301
- "mineself2016/GeneMamba",
302
- trust_remote_code=True
303
- )
304
-
305
- # Modify hyperparameters if needed
306
- config.hidden_size = 512
307
- config.num_hidden_layers = 24
308
- config.vocab_size = 25426
309
-
310
- # Import and instantiate model
311
- from modeling_genemamba import GeneMambaForMaskedLM
312
-
313
- model = GeneMambaForMaskedLM(config)
314
-
315
- print(f"Total parameters: {model.num_parameters() / 1e9:.2f}B")
316
-
317
- # Now proceed with training as in Phase 3
318
  ```
319
 
320
  ---
@@ -380,7 +336,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
380
  "mineself2016/GeneMamba", num_labels=10, trust_remote_code=True
381
  )
382
 
383
- # Masked LM
384
  from transformers import AutoModelForMaskedLM
385
  model = AutoModelForMaskedLM.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
386
  ```
@@ -476,11 +432,9 @@ input_ids = tokenizer(gene_ids, return_tensors="pt", padding=True)["input_ids"]
476
 
477
  See the `examples/` directory for complete scripts:
478
 
479
- - `00_preprocess_to_input_ids.py` - h5ad to ranked gene token IDs
480
- - `01_extract_embeddings.py` - Extract cell embeddings
481
- - `10_finetune_classification.py` - Cell type annotation
482
- - `20_continue_pretraining_reference.py` - Domain adaptation
483
- - `21_pretrain_from_scratch_reference.py` - Training from scratch
484
 
485
  Run any example:
486
 
 
20
  - [Quick Start](#quick-start)
21
  - [Phase 1: Extract Cell Embeddings](#phase-1-extract-cell-embeddings)
22
  - [Phase 2: Downstream Tasks](#phase-2-downstream-tasks)
23
+ - [Phase 3: Train from Scratch](#phase-3-train-from-scratch)
 
24
  - [Model Variants](#model-variants)
25
  - [Architecture](#architecture)
26
  - [Datasets](#datasets)
 
36
 
37
  - **Takes ranked gene sequences** as input (genes sorted by expression level)
38
  - **Outputs cell embeddings** suitable for clustering, classification, and batch integration
39
+ - **Supports multiple downstream tasks** including cell type annotation and next-token pretraining
40
  - **Is compatible with Hugging Face Transformers** for easy integration into existing pipelines
41
 
42
  ### Key Features
43
 
44
  ✅ **Efficient Sequence Processing**: SSM-based architecture with linear complexity
45
  ✅ **Cell Representation Learning**: Direct cell embedding without intermediate steps
46
+ ✅ **Multi-task Support**: Classification, next-token pretraining, and embeddings in one model
47
  ✅ **Hugging Face Integration**: Standard `from_pretrained()` and `save_pretrained()` interface
48
  ✅ **Production Ready**: Pretrained checkpoints available on Hugging Face Hub
49
 
 
203
 
204
  ---
205
 
206
+ ### Phase 3: Train from Scratch
207
 
208
+ Train a new GeneMamba model with **next-token prediction**.
209
+ If a checkpoint exists, resume automatically; otherwise start from scratch.
210
 
211
  ```python
212
  import torch
213
+ from pathlib import Path
214
  from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
 
215
 
 
216
  tokenizer = AutoTokenizer.from_pretrained(
217
  "mineself2016/GeneMamba",
218
+ trust_remote_code=True,
 
 
 
 
219
  )
220
 
221
+ print("vocab_size:", tokenizer.vocab_size) # 25426
222
+ print("unk/pad:", tokenizer.unk_token_id, tokenizer.pad_token_id) # 0, 1
223
+ print("cls/mask:", tokenizer.cls_token_id, tokenizer.mask_token_id) # None, None
 
 
224
 
225
+ # Build model config
226
+ from configuration_genemamba import GeneMambaConfig
227
+ from modeling_genemamba import GeneMambaForMaskedLM
228
 
229
+ config = GeneMambaConfig(
230
+ vocab_size=25426,
231
+ hidden_size=512,
232
+ num_hidden_layers=24,
233
+ max_position_embeddings=2048,
234
+ mamba_mode="mean",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  )
236
 
237
+ # Resume if a checkpoint exists. Note: Trainer saves checkpoints as
+ # checkpoint-<step>; point this at the one you want, or pass
+ # resume_from_checkpoint=True to trainer.train() to pick up the latest.
238
+ checkpoint_dir = Path("./from_scratch_pretrain/checkpoint-last")
239
+ if checkpoint_dir.exists():
240
+ model = AutoModelForMaskedLM.from_pretrained(
241
+ str(checkpoint_dir),
242
+ trust_remote_code=True,
243
+ local_files_only=True,
244
+ )
245
+ resume_from_checkpoint = str(checkpoint_dir)
246
+ else:
247
+ model = GeneMambaForMaskedLM(config)
248
+ resume_from_checkpoint = None
249
+
250
+ class NextTokenTrainer(Trainer):
251
+ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
252
+ input_ids = inputs["input_ids"]
253
+ logits = model(input_ids=input_ids).logits
254
+ shift_logits = logits[:, :-1, :].contiguous()
255
+ shift_labels = input_ids[:, 1:].contiguous().to(shift_logits.device)
256
+ loss = torch.nn.functional.cross_entropy(
257
+ shift_logits.view(-1, shift_logits.size(-1)),
258
+ shift_labels.view(-1),
+ ignore_index=tokenizer.pad_token_id,  # do not train on padded positions
259
+ )
260
+ return loss
261
+
262
+ trainer = NextTokenTrainer(
263
  model=model,
264
  args=TrainingArguments(
265
+ output_dir="./from_scratch_pretrain",
266
  num_train_epochs=3,
267
  per_device_train_batch_size=32,
268
  learning_rate=2e-5,
269
  ),
270
  train_dataset=train_dataset,
 
271
  )
272
 
273
+ trainer.train(resume_from_checkpoint=resume_from_checkpoint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  ```
275
 
276
  ---
 
336
  "mineself2016/GeneMamba", num_labels=10, trust_remote_code=True
337
  )
338
 
339
+ # Language modeling head (used with next-token objective)
340
  from transformers import AutoModelForMaskedLM
341
  model = AutoModelForMaskedLM.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
342
  ```
 
432
 
433
  See the `examples/` directory for complete scripts:
434
 
435
+ - `1_extract_embeddings.py` - Extract cell embeddings
436
+ - `2_finetune_classification.py` - Cell type annotation
437
+ - `4_pretrain_from_scratch.py` - Train from scratch (next-token + optional resume)
 
 
438
 
439
  Run any example:
440