lpkphd committed (verified)
Commit 280d567 · 1 Parent(s): 84580ce

Upload gpt2_reference/config.yaml with huggingface_hub

Files changed (1)
  1. gpt2_reference/config.yaml +25 -14
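
The commit message says the file was pushed with huggingface_hub. For context, a minimal sketch of the kind of upload call that produces a single-file commit like this one; the repo_id and local path are hypothetical placeholders, and only path_in_repo and the commit message come from this page:

```python
# Hypothetical sketch of the upload call behind this commit; repo_id and
# the local path are placeholders, not taken from the commit itself.
from huggingface_hub import HfApi

api = HfApi()  # authenticates via HF_TOKEN or a prior `huggingface-cli login`
api.upload_file(
    path_or_fileobj="gpt2_reference/config.yaml",  # local file to push (assumed path)
    path_in_repo="gpt2_reference/config.yaml",     # destination shown in this diff
    repo_id="lpkphd/<repo>",                       # placeholder; not shown on this page
    commit_message="Upload gpt2_reference/config.yaml with huggingface_hub",
)
```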
gpt2_reference/config.yaml CHANGED
@@ -1,37 +1,48 @@
-# GPT-2 124M Reference — reproduce original GPT-2 for baseline comparison
-# Standard MHA + GELU FFN + LayerNorm
-
 d_model: 768
 n_layers: 12
 vocab_size: 50257
-
 attention:
   type: mha
   n_heads: 12
+  n_kv_heads: null
+  kv_compression_dim: 256
+  q_compression_dim: 384
+  rope_dim: 64
   dropout: 0.0
-
 ffn:
   type: gelu
+  hidden_mult: 2.6666666666666665
+  n_experts: 8
+  top_k: 2
+  shared_experts: 1
+  load_balance_weight: 0.01
   dropout: 0.0
-
 position:
-  type: rope  # Upgrade from sinusoidal for fair comparison
+  type: rope
   max_seq_len: 1024
   rope_base: 10000.0
-
+  rope_dim: null
 norm:
   type: layernorm
-  eps: 1.0e-5
-
+  eps: 1.0e-05
 output:
   type: tied
-
 training:
-  batch_size: 64
+  batch_size: 8
   seq_len: 1024
-  lr: 6.0e-4
-  min_lr: 6.0e-5
+  lr: 0.0006
+  min_lr: 6.0e-05
   warmup_steps: 1000
   max_steps: 50000
   weight_decay: 0.1
   grad_clip: 1.0
+  beta1: 0.9
+  beta2: 0.95
+  dataset: HuggingFaceFW/fineweb-edu
+  tokenizer: gpt2
+  log_interval: 10
+  eval_interval: 500
+  save_interval: 2500
+  eval_steps: 100
+  embed_dropout: 0.0
+  residual_dropout: 0.0
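
The new training block pins lr: 0.0006, min_lr: 6.0e-05, warmup_steps: 1000, and max_steps: 50000, but the commit ships only the config, not the training loop. Those four keys are the usual inputs to a linear-warmup, cosine-decay schedule, so here is a minimal sketch under that assumption, reading the values straight from the uploaded file:

```python
# Sketch of a linear-warmup + cosine-decay LR schedule driven by the new
# training keys. The schedule shape is an assumption; only the numbers
# come from gpt2_reference/config.yaml.
import math
import yaml

with open("gpt2_reference/config.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["training"]
lr, min_lr = train["lr"], train["min_lr"]                      # 0.0006, 6.0e-05
warmup, max_steps = train["warmup_steps"], train["max_steps"]  # 1000, 50000

def lr_at(step: int) -> float:
    if step < warmup:  # linear warmup from ~0 up to the peak lr
        return lr * (step + 1) / warmup
    # cosine decay from lr down to min_lr over the remaining steps
    progress = min(1.0, (step - warmup) / (max_steps - warmup))
    return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(warmup), lr_at(max_steps))  # ~6e-07, 0.0006, 6e-05
```

The remaining additions are consistent with that reading: beta1: 0.9 / beta2: 0.95 and weight_decay: 0.1 are AdamW settings commonly used for GPT-style pretraining; the new ffn keys (n_experts: 8, top_k: 2, shared_experts: 1) describe a mixture-of-experts layer whose router is presumably regularized by an auxiliary load-balancing loss scaled by load_balance_weight: 0.01; and the kv_compression_dim / q_compression_dim / rope_dim keys under attention look like parameters for a latent-compressed attention variant, though the implementation is not part of this commit.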