Update Phase 3 to next-token train-from-scratch with checkpoint auto-resume
Browse files
README.md
CHANGED
|
@@ -211,7 +211,8 @@ If a checkpoint exists, resume automatically; otherwise start from scratch.
|
|
| 211 |
```python
|
| 212 |
import torch
|
| 213 |
from pathlib import Path
|
| 214 |
-
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
|
|
|
|
| 215 |
|
| 216 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 217 |
"mineself2016/GeneMamba",
|
|
@@ -222,30 +223,31 @@ print("vocab_size:", tokenizer.vocab_size) # 25426
|
|
| 222 |
print("unk/pad:", tokenizer.unk_token_id, tokenizer.pad_token_id) # 0, 1
|
| 223 |
print("cls/mask:", tokenizer.cls_token_id, tokenizer.mask_token_id) # None, None
|
| 224 |
|
| 225 |
-
# Build model config
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
config =
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
num_hidden_layers=24,
|
| 233 |
-
max_position_embeddings=2048,
|
| 234 |
-
mamba_mode="mean",
|
| 235 |
-
)
|
| 236 |
|
| 237 |
# Resume if checkpoint exists
|
| 238 |
-
|
|
|
|
|
|
|
| 239 |
if checkpoint_dir.exists():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
model = AutoModelForMaskedLM.from_pretrained(
|
| 241 |
-
|
| 242 |
trust_remote_code=True,
|
| 243 |
local_files_only=True,
|
| 244 |
)
|
| 245 |
-
resume_from_checkpoint = str(checkpoint_dir)
|
| 246 |
else:
|
| 247 |
-
model =
|
| 248 |
-
resume_from_checkpoint = None
|
| 249 |
|
| 250 |
class NextTokenTrainer(Trainer):
|
| 251 |
def compute_loss(self, model, inputs, return_outputs=False):
|
|
@@ -262,7 +264,7 @@ class NextTokenTrainer(Trainer):
|
|
| 262 |
trainer = NextTokenTrainer(
|
| 263 |
model=model,
|
| 264 |
args=TrainingArguments(
|
| 265 |
-
output_dir=
|
| 266 |
num_train_epochs=3,
|
| 267 |
per_device_train_batch_size=32,
|
| 268 |
learning_rate=2e-5,
|
|
|
|
| 211 |
```python
|
| 212 |
import torch
|
| 213 |
from pathlib import Path
|
| 214 |
+
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, Trainer, TrainingArguments
|
| 215 |
+
from transformers.trainer_utils import get_last_checkpoint
|
| 216 |
|
| 217 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 218 |
"mineself2016/GeneMamba",
|
|
|
|
| 223 |
print("unk/pad:", tokenizer.unk_token_id, tokenizer.pad_token_id) # 0, 1
|
| 224 |
print("cls/mask:", tokenizer.cls_token_id, tokenizer.mask_token_id) # None, None
|
| 225 |
|
| 226 |
+
# Build model config (no local modeling file import required)
|
| 227 |
+
config = AutoConfig.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
|
| 228 |
+
config.vocab_size = 25426
|
| 229 |
+
config.hidden_size = 512
|
| 230 |
+
config.num_hidden_layers = 24
|
| 231 |
+
config.max_position_embeddings = 2048
|
| 232 |
+
config.mamba_mode = "mean"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
# Resume from ./checkpoint-last if present, otherwise from the newest checkpoint-<step> dir
|
| 235 |
+
output_dir = "./from_scratch_pretrain"
|
| 236 |
+
checkpoint_dir = Path(output_dir) / "checkpoint-last"
|
| 237 |
+
|
| 238 |
if checkpoint_dir.exists():
|
| 239 |
+
resume_from_checkpoint = str(checkpoint_dir)
|
| 240 |
+
else:
|
| 241 |
+
resume_from_checkpoint = get_last_checkpoint(output_dir)
|
| 242 |
+
|
| 243 |
+
if resume_from_checkpoint is not None:
|
| 244 |
model = AutoModelForMaskedLM.from_pretrained(
|
| 245 |
+
resume_from_checkpoint,
|
| 246 |
trust_remote_code=True,
|
| 247 |
local_files_only=True,
|
| 248 |
)
|
|
|
|
| 249 |
else:
|
| 250 |
+
model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True)
|
|
|
|
| 251 |
|
| 252 |
class NextTokenTrainer(Trainer):
|
| 253 |
def compute_loss(self, model, inputs, return_outputs=False):
|
|
|
|
| 264 |
trainer = NextTokenTrainer(
|
| 265 |
model=model,
|
| 266 |
args=TrainingArguments(
|
| 267 |
+
output_dir=output_dir,
|
| 268 |
num_train_epochs=3,
|
| 269 |
per_device_train_batch_size=32,
|
| 270 |
learning_rate=2e-5,
|