mineself2016 committed (verified)
Commit f141719 · Parent(s): ec0b6f5

Update Phase 3 to next-token train-from-scratch with checkpoint auto-resume

Files changed (1): README.md (+20 -18)
README.md CHANGED
@@ -211,7 +211,8 @@ If a checkpoint exists, resume automatically; otherwise start from scratch.
 ```python
 import torch
 from pathlib import Path
-from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
+from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, Trainer, TrainingArguments
+from transformers.trainer_utils import get_last_checkpoint
 
 tokenizer = AutoTokenizer.from_pretrained(
     "mineself2016/GeneMamba",
@@ -222,30 +223,31 @@ print("vocab_size:", tokenizer.vocab_size) # 25426
 print("unk/pad:", tokenizer.unk_token_id, tokenizer.pad_token_id) # 0, 1
 print("cls/mask:", tokenizer.cls_token_id, tokenizer.mask_token_id) # None, None
 
-# Build model config
-from configuration_genemamba import GeneMambaConfig
-from modeling_genemamba import GeneMambaForMaskedLM
-
-config = GeneMambaConfig(
-    vocab_size=25426,
-    hidden_size=512,
-    num_hidden_layers=24,
-    max_position_embeddings=2048,
-    mamba_mode="mean",
-)
+# Build model config (no local modeling file import required)
+config = AutoConfig.from_pretrained("mineself2016/GeneMamba", trust_remote_code=True)
+config.vocab_size = 25426
+config.hidden_size = 512
+config.num_hidden_layers = 24
+config.max_position_embeddings = 2048
+config.mamba_mode = "mean"
 
 # Resume if checkpoint exists
-checkpoint_dir = Path("./from_scratch_pretrain/checkpoint-last")
+output_dir = "./from_scratch_pretrain"
+checkpoint_dir = Path(output_dir) / "checkpoint-last"
+
 if checkpoint_dir.exists():
+    resume_from_checkpoint = str(checkpoint_dir)
+else:
+    resume_from_checkpoint = get_last_checkpoint(output_dir)
+
+if resume_from_checkpoint is not None:
     model = AutoModelForMaskedLM.from_pretrained(
-        str(checkpoint_dir),
+        resume_from_checkpoint,
         trust_remote_code=True,
         local_files_only=True,
     )
-    resume_from_checkpoint = str(checkpoint_dir)
 else:
-    model = GeneMambaForMaskedLM(config)
-    resume_from_checkpoint = None
+    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True)
 
 class NextTokenTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False):
@@ -262,7 +264,7 @@ class NextTokenTrainer(Trainer):
 trainer = NextTokenTrainer(
     model=model,
     args=TrainingArguments(
-        output_dir="./from_scratch_pretrain",
+        output_dir=output_dir,
         num_train_epochs=3,
         per_device_train_batch_size=32,
         learning_rate=2e-5,
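
The hunks end right at `class NextTokenTrainer(Trainer):`, so the body of `compute_loss` and the final `trainer.train(...)` call are outside the diff shown here. Below is a minimal, illustrative sketch only, not the repository's actual implementation: it assumes a standard shifted cross-entropy next-token objective, that the model returns `logits` of shape (batch, seq_len, vocab_size), that `trainer` and `resume_from_checkpoint` from the snippet above are in scope, and that pad tokens (id 1 per the README) should be ignored in the loss.

```python
import torch.nn.functional as F
from transformers import Trainer

class NextTokenTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Forward pass: logits have shape (batch, seq_len, vocab_size).
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
        )
        logits = outputs.logits

        # Next-token objective: align position t's logits with token t+1
        # by dropping the last logit and the first label.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = inputs["input_ids"][:, 1:].contiguous()

        loss = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=1,  # assumption: skip pad positions (pad_token_id == 1 per the README)
        )
        return (loss, outputs) if return_outputs else loss

# Trainer restores model, optimizer and scheduler state when a checkpoint
# path is passed; None starts training from scratch.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
```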