Anshler committed
Commit 7d80fda · verified · 1 Parent(s): 85b7ffb

Upload config.yaml

Files changed (1)
  1. config.yaml +79 -0
config.yaml ADDED
@@ -0,0 +1,79 @@
+ wandb_log: true
+ wandb_entity: null
+ wandb_project: null
+ wandb_key: null
+
+ defaults:
+   - _self_
+   - setup: 'gpt2_69m'
+
+ hydra:
+   run:
+     dir: .
+
+ mode: 'ntp'
+ seed: 22
+ rank: 0
+ suffix: null
+
+ # model
+ base_model: 'openai-community/gpt2'
+ pretrained_model: 'openai-community/gpt2'
+ dataset: openwebtext
+ data_dir: './data/openwebtext_preprocess' # set your data path
+ n_embd: null
+ n_layer: null
+ n_head: null
+ vocab_size: null
+
+ load_path: null
+ port: 9819
+ distributed: False
+ world_size: 1
+ use_torch_compile: True
+ compile_dynamo_cache_size_limit: 256
+
+ # optimization
+ lr: 6e-4
+ lr_schedule: 'cosine_with_min_lr' # other options: 'cosine', 'constant_with_warmup', 'constant'
+ beta1: 0.9
+ beta2: 0.95
+ grad_clip_thresh: 1.0
+ warmup_steps: 2000
+ min_lr: 6e-5
+ eps: 1e-8
+ mixed_precision: null
+ weight_decay: 0.1
+ train_steps: 600000 # 600k steps
+ n_epochs: 0
+ num_workers: 2
+
+ # total batch size = 1024 (context length) * 256 (update_batch_size) * 2 (grad_acc_steps) = 524,288 tokens (~0.5M)
+ # total number of tokens = train_steps * total batch size = 600k * ~0.5M ≈ 300B tokens
+ update_batch_size: 256 # micro batch size is update_batch_size // num_gpus
+ grad_acc_steps: 2
+ block_size: 1024 # context length
+ dropout: 0.0
+ bias: False
+
+ log_path: null
+ use_accelerator: True
+
+ # saving/evaluation/logging frequency
+ save_step_freq: 10000
+ eval_step_freq: 1000
+ log_step_freq: 50
+ global_step: 0
+ val_datasets: ['openwebtext'] # for measuring perplexity (ppl)
+ batch_size_eval: 256
+ eval_limit: 1000
+
+ topK_attri: 4 # TopK for concept label
+ concept_num: 32 # TopK for SAE activation
+ concept_dim: 32768 # SAE concept dimension
+
+ # sae
+ sae_location: 'resid_post_mlp'
+ insert_layer_index: null # layer of the CoCoMix model that predicts and inserts the concept
+ sae_layer_index: null # SAE layer used for concept extraction
+ lam_concept: 0.1
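
For context, the `defaults:` and `hydra:` blocks indicate this file is composed and loaded through Hydra. The following is a minimal sketch of how such a config is typically consumed; the `train` entry point is hypothetical and not part of this repository, and only the config keys come from config.yaml above.

import hydra
from omegaconf import DictConfig, OmegaConf

# Hypothetical entry point: Hydra locates config.yaml in config_path, then
# merges the 'setup/gpt2_69m' group on top of it per the defaults list
# (with _self_ listed first, the setup file overrides these base values).
# hydra.run.dir = '.' keeps outputs in the current directory.
@hydra.main(version_base=None, config_path=".", config_name="config")
def train(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg))  # resolved config, e.g. cfg.lr == 6e-4

if __name__ == "__main__":
    train()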
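The optimization block names a 'cosine_with_min_lr' schedule with warmup_steps: 2000 and min_lr: 6e-5. A generic implementation of that shape (linear warmup to the peak lr, then cosine decay down to the floor) is sketched below; this is an illustration of the named schedule, not the repository's actual scheduler.

import math

def lr_at_step(step: int, lr: float = 6e-4, min_lr: float = 6e-5,
               warmup_steps: int = 2000, train_steps: int = 600_000) -> float:
    # Linear warmup from 0 to lr over the first warmup_steps updates.
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps
    # Cosine decay from lr down to min_lr over the remaining steps.
    progress = min(1.0, (step - warmup_steps) / max(1, train_steps - warmup_steps))
    return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))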
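The batch-size comment in the config can be checked directly; every number below is taken from the file itself, and nothing here is part of the training code.

block_size = 1024          # context length
update_batch_size = 256    # sequences per optimizer update (across GPUs)
grad_acc_steps = 2
train_steps = 600_000

tokens_per_update = block_size * update_batch_size * grad_acc_steps
print(tokens_per_update)                 # 524288, i.e. ~0.5M tokens
print(tokens_per_update * train_steps)   # 314572800000 (~315B; the config rounds to 300B)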