LisaMegaWatts committed on
Commit
969a1de
·
verified ·
1 Parent(s): a3d7dcd

Upload config.toml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.toml +47 -0
config.toml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: 100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12,200 steps
# (max_steps = 12305 overshoots slightly, ≈100.8M tokens)

# Model architecture (~5M parameters).
[model]
arch = "transformer"
embed_dim = 256        # = n_heads * head_dim (4 * 64)
n_layers = 6
n_heads = 4
head_dim = 64
ffn_mult = 4           # presumably FFN hidden = ffn_mult * embed_dim — confirm in model code
context_length = 256
dropout = 0.0          # no dropout for a short, compute-optimal run
bias = false
weight_tying = true    # tie input embedding with the output projection

# Optimization schedule and run length.
[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5                  # lr floor — assumes a decay schedule; confirm scheduler shape
warmup_steps = 500
max_steps = 12305              # * 8192 tokens/step ≈ 100M-token Chinchilla budget
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500            # presumably steps between eval passes — verify against trainer
eval_steps = 25                # presumably batches per eval pass — verify against trainer
checkpoint_interval = 2000
seed = 42

# Curriculum learning disabled for this run.
[training.curriculum]
enabled = false

# Coreset selection disabled for this run.
[training.coreset]
enabled = false

# Dataset and tokenizer locations, relative to this repo's checkout.
[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

# Generation-time defaults.
[inference]
precision = "f16"
compile = false        # skip model compilation at inference time
temperature = 0.8
top_k = 40
max_new_tokens = 500