Upload 2 files
- gpt2_69m_cocomix.yaml +38 -0
- gpt2_69m_ntp.yaml +33 -0
gpt2_69m_cocomix.yaml
ADDED
@@ -0,0 +1,38 @@
# @package _global_

mode: 'cocomix'
n_embd: 512
n_layer: 8
n_head: 8
compile_dynamo_cache_size_limit: 512

# optimization
lr: 6e-4
lr_schedule: 'cosine_with_min_lr' # alternatives: 'cosine', 'constant_with_warmup', 'constant'
beta1: 0.9
beta2: 0.95
grad_clip_thresh: 1.
warmup_steps: 130
min_lr: 6e-5
eps: 1e-8
mixed_precision: null
weight_decay: 0.1
train_steps: 40000 # 40k steps ~ 20B tokens

# total batch size = 1024 (context length) * 512 (update_batch_size) * 1 (grad_acc_steps) ~ 0.5M tokens
# total number of tokens = train_steps * total batch size = 40k * 0.5M = 20B tokens
update_batch_size: 512 # micro batch size is update_batch_size // num_gpus
grad_acc_steps: 1
block_size: 1024

# saving/evaluation/logging frequency
save_step_freq: 2000
eval_step_freq: 1000
log_step_freq: 50
val_datasets: ['openwebtext'] # for measuring ppl
batch_size_eval: 256
eval_limit: 1000

# sae
insert_layer_index: 3 # the CoCoMix model layer that predicts and inserts the concept
sae_layer_index: 5 # the SAE layer used for concept extraction
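Both files open with `# @package _global_`, a Hydra directive that lifts the file's keys to the top level of the composed config instead of nesting them under a config group. A minimal usage sketch follows; the entry point, config paths, and the group name `experiment` are assumptions about this repo, not something the commit shows:

# Hypothetical usage sketch. If this file lives at, say,
# conf/experiment/gpt2_69m_cocomix.yaml, it would be selected as:
#
#   python train.py +experiment=gpt2_69m_cocomix
#
# and, because of "# @package _global_", its keys (mode, lr, n_embd, ...)
# override top-level keys rather than landing under cfg.experiment.
import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    print(cfg.mode, cfg.n_embd, cfg.lr)  # e.g. 'cocomix', 512, 6e-4

if __name__ == "__main__":
    main()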
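The batch-size comments in the config are easy to verify. A short plain-Python sketch reproducing the token-budget arithmetic; `num_gpus = 8` is an assumed example value, not something the config fixes:

# Token-budget arithmetic implied by the config comments.
block_size = 1024        # context length
update_batch_size = 512  # sequences per optimizer step
grad_acc_steps = 1
train_steps = 40_000

tokens_per_step = block_size * update_batch_size * grad_acc_steps
total_tokens = train_steps * tokens_per_step

num_gpus = 8  # assumed example value; the config does not specify this
micro_batch_size = update_batch_size // num_gpus

print(f"{tokens_per_step:,} tokens/step")  # 524,288 (~0.5M)
print(f"{total_tokens:,} tokens total")    # 20,971,520,000 (~20B)
print(f"{micro_batch_size} sequences per GPU per step")  # 64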
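The schedule name 'cosine_with_min_lr' denotes linear warmup followed by a cosine decay that bottoms out at min_lr rather than zero (the name matches a scheduler type in the transformers library). Since the repo's training loop isn't part of this commit, here is a self-contained sketch of what the curve plausibly computes, using the hyperparameters above:

import math

def lr_at(step, lr=6e-4, min_lr=6e-5, warmup_steps=130, train_steps=40_000):
    """Linear warmup, then cosine decay from lr down to min_lr.
    A sketch, not necessarily the exact curve this repo's trainer uses."""
    if step < warmup_steps:
        return lr * step / warmup_steps
    progress = (step - warmup_steps) / (train_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + (lr - min_lr) * cosine

print(lr_at(0), lr_at(130), lr_at(40_000))  # 0.0, 6e-4, 6e-5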
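The two layer indices under `# sae` are the CoCoMix-specific knobs: concepts are extracted by an SAE reading layer 5 of a pretrained reference model, while this model learns to predict those concepts from its own layer-3 hidden state and mixes the prediction back in. The actual modules live elsewhere in the repo; the sketch below only illustrates how such indices are typically consumed, and every name in it (ConceptHead, n_concepts, ...) is hypothetical:

import torch.nn as nn

insert_layer_index = 3  # where this model predicts/inserts concepts
sae_layer_index = 5     # reference-model layer the SAE reads from

class ConceptHead(nn.Module):
    """Hypothetical sketch: predict concept logits from a hidden state and
    compress them into a continuous vector that is mixed back in."""
    def __init__(self, n_embd=512, n_concepts=32768):
        super().__init__()
        self.predict = nn.Linear(n_embd, n_concepts)
        self.compress = nn.Linear(n_concepts, n_embd)

    def forward(self, h):
        concept_logits = self.predict(h)  # trained against SAE targets
        mixed = self.compress(concept_logits.softmax(-1))
        return concept_logits, h + mixed  # insert as a residual update

In training, concept_logits would carry an auxiliary loss against the SAE's active concepts at sae_layer_index of the frozen reference model, alongside the usual next-token loss.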
gpt2_69m_ntp.yaml
ADDED
@@ -0,0 +1,33 @@
# @package _global_

mode: 'ntp'
n_embd: 624
n_layer: 8
n_head: 8

# optimization
lr: 6e-4
lr_schedule: 'cosine_with_min_lr' # alternatives: 'cosine', 'constant_with_warmup', 'constant'
beta1: 0.9
beta2: 0.95
grad_clip_thresh: 1.
warmup_steps: 130
min_lr: 6e-5
eps: 1e-8
mixed_precision: null
weight_decay: 0.1
train_steps: 40000 # 40k steps ~ 20B tokens

# total batch size = 1024 (context length) * 512 (update_batch_size) * 1 (grad_acc_steps) ~ 0.5M tokens
# total number of tokens = train_steps * total batch size = 40k * 0.5M = 20B tokens
update_batch_size: 512 # micro batch size is update_batch_size // num_gpus
grad_acc_steps: 1
block_size: 1024

# saving/evaluation/logging frequency
save_step_freq: 2000
eval_step_freq: 1000
log_step_freq: 50
val_datasets: ['openwebtext'] # for measuring ppl
batch_size_eval: 256
eval_limit: 1000
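The only substantive differences from the CoCoMix config are mode: 'ntp' (a plain next-token-prediction baseline), the absence of the `# sae` block, and a wider n_embd of 624. One plausible reading, offered here as an assumption rather than something the commit states, is that the baseline is widened so total parameter counts match once CoCoMix's concept modules are included. The standard GPT-2 parameter estimate below (vocab size 50257 and tied embeddings assumed) puts the 624-wide model at roughly 69M, matching the "69m" in both file names:

def gpt2_params(n_layer, n_embd, vocab_size=50257, block_size=1024):
    """Rough GPT-2 parameter count: 12*d^2 per transformer block plus
    (tied) token and position embeddings; biases/LayerNorms omitted."""
    blocks = 12 * n_layer * n_embd ** 2
    embeddings = vocab_size * n_embd + block_size * n_embd
    return blocks + embeddings

print(f"ntp     (n_embd=624): {gpt2_params(8, 624) / 1e6:.1f}M")  # ~69.4M
print(f"cocomix (n_embd=512): {gpt2_params(8, 512) / 1e6:.1f}M")  # ~51.4M, before concept modules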