pretrain core 1

- config-0.json +1 -1
- scripts/pretrain_core_model_0.yaml +12 -24
config-0.json
CHANGED

@@ -21,7 +21,7 @@
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 4300.0,
-  "tie_word_embeddings":
+  "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.45.0.dev0",
   "use_cache": true,
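Setting "tie_word_embeddings": false gives the language-modeling head its own weight matrix instead of sharing the input embedding's; the tie_embeddings flag in the YAML below toggles the same behavior on the training side. A minimal PyTorch sketch of what tying amounts to (the dimensions are hypothetical, not taken from this config):

```python
import torch.nn as nn

vocab_size, hidden_size = 32000, 2048  # hypothetical dimensions

embed = nn.Embedding(vocab_size, hidden_size)             # weight: (vocab, hidden)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)  # weight: (vocab, hidden)

# tie_word_embeddings: true  -> the head shares the embedding matrix
lm_head.weight = embed.weight

# tie_word_embeddings: false -> the two matrices stay independent,
# costing roughly vocab_size * hidden_size extra trainable parameters.
```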
scripts/pretrain_core_model_0.yaml
CHANGED

@@ -61,7 +61,6 @@ train:
   global_batch_size: 512

   # Number of samples per data-parallel rank (type: int, default: 4)
-  # micro_batch_size: 2
   micro_batch_size: 8

   # Number of iterations with learning rate warmup active (type: int, default: 2000)
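With global_batch_size: 512 and micro_batch_size: 8, the trainer reaches the global batch through gradient accumulation across data-parallel ranks. A sketch of the arithmetic, assuming litgpt-style semantics and a hypothetical 8-GPU node (the config itself leaves devices: auto):

```python
global_batch_size = 512
micro_batch_size = 8   # samples per data-parallel rank per forward pass
devices = 8            # hypothetical; resolved at runtime from devices: auto

# Each optimizer step accumulates micro-batches until the global batch is seen.
assert global_batch_size % (devices * micro_batch_size) == 0
grad_accum_steps = global_batch_size // (devices * micro_batch_size)
print(grad_accum_steps)  # -> 8 micro-batches per rank per optimizer step
```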
@@ -77,11 +76,10 @@ train:
   max_steps:

   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  # max_seq_length: 4096
   max_seq_length: 1024

   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings:
+  tie_embeddings: false

   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -107,22 +105,17 @@ eval:
   final_validation: true

# Optimizer-related arguments
-
-
-
-#
-
-#
-
-#
-
-
-
-# weight_decay: 0.01
-# # (type: tuple, default: (0.9,0.999))
-# betas:
-#   - 0.9
-#   - 0.999
+optimizer:
+  class_path: torch.optim.AdamW
+  init_args:
+    # (type: float, default: 0.001)
+    lr: 3e-4
+    # (type: float, default: 0.01)
+    weight_decay: 0.01
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.999

# optimizer:
#   class_path: sophia_opt.SophiaG
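The active optimizer moves from dolphinflow.DolphinFlow (removed in the next hunk) to stock torch.optim.AdamW, while the commented-out sophia_opt.SophiaG block stays in place for reference. Roughly what the class_path/init_args pair resolves to at runtime, with a stand-in model:

```python
import torch

model = torch.nn.Linear(16, 16)  # stand-in for the actual core model

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=0.01,
    betas=(0.9, 0.999),
)
```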
@@ -134,11 +127,6 @@ eval:
#   rho: 0.05
#   weight_decay: 0.1

-optimizer:
-  class_path: dolphinflow.DolphinFlow
-  init_args:
-    lr: 3e-4
-
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto
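To sanity-check the merged settings before launching a run, the YAML can be loaded directly; a minimal sketch assuming PyYAML and a litgpt-style key layout (the exact nesting under train: is an assumption):

```python
import yaml

with open("scripts/pretrain_core_model_0.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["optimizer"]["class_path"] == "torch.optim.AdamW"
# PyYAML (YAML 1.1) reads a bare 3e-4 as a string, so coerce before comparing.
assert float(cfg["optimizer"]["init_args"]["lr"]) == 3e-4
assert cfg["train"]["micro_batch_size"] == 8
assert cfg["train"]["tie_embeddings"] is False
```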