Anshler committed
Commit 85b7ffb · verified · 1 parent: 523e45a

Upload 2 files

Files changed (2)
  1. gpt2_69m_cocomix.yaml +38 -0
  2. gpt2_69m_ntp.yaml +33 -0
gpt2_69m_cocomix.yaml ADDED
@@ -0,0 +1,38 @@
+ # @package _global_
+
+ mode: 'cocomix'
+ n_embd: 512
+ n_layer: 8
+ n_head: 8
+ compile_dynamo_cache_size_limit: 512
+
+ # optimization
+ lr: 6e-4
+ lr_schedule: 'cosine_with_min_lr' # other options: 'cosine', 'constant_with_warmup', 'constant'
+ beta1: 0.9
+ beta2: 0.95
+ grad_clip_thresh: 1.
+ warmup_steps: 130
+ min_lr: 6e-5
+ eps: 1e-8
+ mixed_precision: null
+ weight_decay: 0.1
+ train_steps: 40000 # 40k steps ~ 20B tokens
+
+ # total batch size = 1024 (context length) * 512 (update_batch_size) * 1 (grad_acc_steps) ~ 0.5M tokens
+ # total number of tokens = train_steps * total batch size = 40k * 0.5M = 20B tokens
+ update_batch_size: 512 # micro batch size is update_batch_size // num_gpus
+ grad_acc_steps: 1
+ block_size: 1024
+
+ # saving/evaluation/logging frequency
+ save_step_freq: 2000
+ eval_step_freq: 1000
+ log_step_freq: 50
+ val_datasets: ['openwebtext'] # measuring ppl
+ batch_size_eval: 256
+ eval_limit: 1000
+
+ # sae
+ insert_layer_index: 3 # CoCoMix layer that predicts and inserts the concept
+ sae_layer_index: 5 # SAE layer used for concept extraction
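
The two "total batch size" comment lines in this config encode the run's token budget. As a quick sanity check, here is a small Python sketch (not part of the repo; num_gpus = 8 is an assumed value for illustration, not taken from the config):

# Sanity check of the batch/token arithmetic in the config comments.
block_size = 1024          # context length
update_batch_size = 512    # sequences per optimizer step, summed across GPUs
grad_acc_steps = 1
train_steps = 40_000

tokens_per_step = block_size * update_batch_size * grad_acc_steps
total_tokens = train_steps * tokens_per_step
print(f"tokens per step: {tokens_per_step:,}")  # 524,288 (~0.5M)
print(f"total tokens:    {total_tokens:,}")     # 20,971,520,000 (~20B)

# Per-GPU micro batch, as described in the update_batch_size comment.
num_gpus = 8                                      # assumption, not from the config
micro_batch_size = update_batch_size // num_gpus  # 64 sequences per GPU
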
gpt2_69m_ntp.yaml ADDED
@@ -0,0 +1,33 @@
+ # @package _global_
+
+ mode: 'ntp'
+ n_embd: 624
+ n_layer: 8
+ n_head: 8
+
+ # optimization
+ lr: 6e-4
+ lr_schedule: 'cosine_with_min_lr' # other options: 'cosine', 'constant_with_warmup', 'constant'
+ beta1: 0.9
+ beta2: 0.95
+ grad_clip_thresh: 1.
+ warmup_steps: 130
+ min_lr: 6e-5
+ eps: 1e-8
+ mixed_precision: null
+ weight_decay: 0.1
+ train_steps: 40000 # 40k steps ~ 20B tokens
+
+ # total batch size = 1024 (context length) * 512 (update_batch_size) * 1 (grad_acc_steps) ~ 0.5M tokens
+ # total number of tokens = train_steps * total batch size = 40k * 0.5M = 20B tokens
+ update_batch_size: 512 # micro batch size is update_batch_size // num_gpus
+ grad_acc_steps: 1
+ block_size: 1024
+
+ # saving/evaluation/logging frequency
+ save_step_freq: 2000
+ eval_step_freq: 1000
+ log_step_freq: 50
+ val_datasets: ['openwebtext'] # measuring ppl
+ batch_size_eval: 256
+ eval_limit: 1000
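
A note on the differing hidden sizes: n_embd is 624 for the NTP baseline but 512 for CoCoMix. A rough GPT-2-style parameter estimate (a sketch assuming GPT-2's 50257-token vocabulary, tied input/output embeddings, and ignoring biases and LayerNorm; none of these assumptions come from the configs) puts the NTP model at about 69M parameters, matching the filenames, while the CoCoMix transformer trunk comes out smaller, plausibly leaving headroom for the concept-prediction and insertion modules configured via insert_layer_index and sae_layer_index:

# Rough GPT-2-style parameter estimate for the two configs above.
# Assumptions (not from the configs): vocab_size = 50257, tied embeddings,
# learned positional embeddings, biases and LayerNorm omitted.
def gpt2_param_estimate(n_embd, n_layer, vocab_size=50257, block_size=1024):
    emb = vocab_size * n_embd + block_size * n_embd  # token + position embeddings
    attn = 4 * n_embd * n_embd                       # qkv projections + output projection
    mlp = 8 * n_embd * n_embd                        # 4x expansion, up + down projections
    return emb + n_layer * (attn + mlp)

print(f"ntp     (n_embd=624): {gpt2_param_estimate(624, 8) / 1e6:.1f}M")  # ~69.4M
print(f"cocomix (n_embd=512): {gpt2_param_estimate(512, 8) / 1e6:.1f}M")  # ~51.4M, before concept modules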