Upload config.yaml

config.yaml ADDED (+79 -0)

@@ -0,0 +1,79 @@
wandb_log: true
wandb_entity: null
wandb_project: null
wandb_key: null

defaults:
  - _self_
  - setup: 'gpt2_69m'

hydra:
  run:
    dir: .

mode: 'ntp'
seed: 22
rank: 0
suffix: null

# model
base_model: 'openai-community/gpt2'
pretrained_model: 'openai-community/gpt2'
dataset: openwebtext
data_dir: './data/openwebtext_preprocess' # set your data path
n_embd: null
n_layer: null
n_head: null
vocab_size: null

load_path: null
port: 9819
distributed: False
world_size: 1
use_torch_compile: True
compile_dynamo_cache_size_limit: 256

# optimization
lr: 6e-4
lr_schedule: 'cosine_with_min_lr' # options: 'cosine', 'constant_with_warmup', 'constant'
beta1: 0.9
beta2: 0.95
grad_clip_thresh: 1.0
warmup_steps: 2000
min_lr: 6e-5
eps: 1e-8
mixed_precision: null
weight_decay: 0.1
train_steps: 600000 # 600k steps
n_epochs: 0
num_workers: 2

# total batch size = 1024 (context length) * 256 (update_batch_size) * 2 (grad_acc_steps) = 524,288 (~0.5M)
# total number of tokens = train_steps * total batch size = 600k * ~0.5M ≈ 300B tokens
update_batch_size: 256 # micro batch size is update_batch_size // num_gpus
grad_acc_steps: 2
block_size: 1024 # context length
dropout: 0.0
bias: False

log_path: null
use_accelerator: True

# saving/evaluation/logging frequency
save_step_freq: 10000
eval_step_freq: 1000
log_step_freq: 50
global_step: 0
val_datasets: ['openwebtext'] # datasets for measuring perplexity
batch_size_eval: 256
eval_limit: 1000

topK_attri: 4 # top-K for concept labels
concept_num: 32 # top-K for SAE activations
concept_dim: 32768 # SAE concept dimension

# sae
sae_location: 'resid_post_mlp'
insert_layer_index: null # CoCoMix layer that predicts and inserts the concept
sae_layer_index: null # SAE layer used for concept extraction
lam_concept: 0.1
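
Since the file declares a Hydra `defaults` list (pulling in a `setup: gpt2_69m` group) and pins `hydra.run.dir` to the current directory, it is presumably loaded through Hydra's decorator API. Below is a minimal sketch of that loading step, assuming the config sits next to the training entry point and that the `setup/gpt2_69m.yaml` group referenced by `defaults` exists elsewhere in the repo; the script name `train.py` is hypothetical and not part of this upload.

# train.py -- minimal sketch of consuming config.yaml through Hydra (assumptions noted above)
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(version_base=None, config_path=".", config_name="config")
def main(cfg: DictConfig) -> None:
    # `hydra.run.dir: .` keeps the working directory unchanged instead of the
    # usual outputs/<date> directory. The `setup` group is expected to fill in
    # the null model fields (n_embd, n_layer, n_head, vocab_size).
    print(OmegaConf.to_yaml(cfg))

    # Per the comment in the config: micro batch size is update_batch_size // num_gpus.
    micro_batch_size = cfg.update_batch_size // max(cfg.world_size, 1)
    print(f"micro batch size per process: {micro_batch_size}")


if __name__ == "__main__":
    main()

With Hydra in place, any field can be overridden from the command line, e.g. `python train.py lr=3e-4 distributed=True world_size=8`.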
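
The batch-size comments embedded in the config can be checked with a couple of lines of arithmetic using only the values the file itself sets; nothing below is new information, it just makes the rounding in those comments explicit.

# Sanity check of the token-budget arithmetic stated in the config comments.
block_size = 1024          # context length
update_batch_size = 256    # sequences per optimizer step (all GPUs combined)
grad_acc_steps = 2
train_steps = 600_000

tokens_per_step = block_size * update_batch_size * grad_acc_steps
total_tokens = tokens_per_step * train_steps

print(f"tokens per optimizer step: {tokens_per_step:,}")  # 524,288 (~0.5M)
print(f"total training tokens:     {total_tokens:,}")     # 314,572,800,000 (rounded to ~300B in the comment)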