Commit: "micro_batch_size: 4"

File changed: scripts/pretrain-core-model.yaml
@@ -67,9 +67,9 @@ train:
 # global_batch_size: 256

 # Number of samples per data-parallel rank (type: int, default: 4)
-
+micro_batch_size: 4
 # micro_batch_size: 2
-micro_batch_size: 1
+# micro_batch_size: 1

 # Number of iterations with learning rate warmup active (type: int, default: 2000)
 lr_warmup_steps: 200

@@ -115,11 +115,11 @@ eval:
 # Optimizer-related arguments

 optimizer:
-  class_path: torch.optim.AdamW
+  # class_path: torch.optim.AdamW
   # class_path: torchao.prototype.low_bit_optim.AdamW8bit
   # class_path: torchao.prototype.low_bit_optim.AdamW4bit
   # class_path: bitsandbytes.optim.AdamW8bit
+  class_path: bitsandbytes.optim.PagedAdamW8bit
   init_args:
     # (type: float, default: 0.001)
     lr: 1e-4