Commit: "micro_batch_size: 4"

File changed: scripts/pretrain-core-model.yaml
@@ -67,9 +67,9 @@ train:
 # global_batch_size: 256

 # Number of samples per data-parallel rank (type: int, default: 4)
-
+micro_batch_size: 4
 # micro_batch_size: 2
-micro_batch_size: 1
+# micro_batch_size: 1

 # Number of iterations with learning rate warmup active (type: int, default: 2000)
 lr_warmup_steps: 200

@@ -115,11 +115,11 @@ eval:
 # Optimizer-related arguments

 optimizer:
-  class_path: torch.optim.AdamW
+  # class_path: torch.optim.AdamW
   # class_path: torchao.prototype.low_bit_optim.AdamW8bit
   # class_path: torchao.prototype.low_bit_optim.AdamW4bit
   # class_path: bitsandbytes.optim.AdamW8bit
+  class_path: bitsandbytes.optim.PagedAdamW8bit
   init_args:
     # (type: float, default: 0.001)
     lr: 1e-4