LisaMegaWatts committed on
Commit
a61fa31
·
verified ·
1 Parent(s): aa3d3f0

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  5m-chinchilla/final.jld2 filter=lfs diff=lfs merge=lfs -text
37
  5m-chinchilla/step_12000.jld2 filter=lfs diff=lfs merge=lfs -text
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  5m-chinchilla/final.jld2 filter=lfs diff=lfs merge=lfs -text
37
  5m-chinchilla/step_12000.jld2 filter=lfs diff=lfs merge=lfs -text
38
+ 5m-monarch/final.jld2 filter=lfs diff=lfs merge=lfs -text
39
+ 5m-monarch/step_12000.jld2 filter=lfs diff=lfs merge=lfs -text
5m-monarch/config.toml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 5M Monarch Mixer config — same param budget as 5m.toml
2
+ # 8 blocks (vs 6 for transformer) due to cheaper sequence mixing
3
+ # Monarch sequence mixer: 67K params/block vs 262K for attention
4
+
5
+ [model]
6
+ arch = "monarch"
7
+ embed_dim = 256
8
+ n_layers = 8
9
+ n_heads = 4 # unused by Monarch, kept for struct compat
10
+ head_dim = 64 # unused by Monarch
11
+ n_monarch_heads = 8
12
+ conv_kernel_size = 4
13
+ ffn_mult = 4
14
+ context_length = 256
15
+ dropout = 0.0
16
+ bias = false
17
+ weight_tying = true
18
+
19
+ [training]
20
+ optimizer = "adamw"
21
+ lr = 6e-4
22
+ min_lr = 6e-5
23
+ warmup_steps = 500
24
+ max_steps = 12305
25
+ batch_size = 32
26
+ grad_clip = 1.0
27
+ precision = "f16"
28
+ eval_interval = 500
29
+ eval_steps = 25
30
+ checkpoint_interval = 2000
31
+ seed = 42
32
+
33
+ [training.curriculum]
34
+ enabled = false
35
+
36
+ [training.coreset]
37
+ enabled = false
38
+
39
+ [data]
40
+ train_path = "../text-pipeline/output/train.txt"
41
+ val_path = "../text-pipeline/output/val.txt"
42
+ tokenizer_dir = "../text-pipeline/output"
43
+
44
+ [inference]
45
+ precision = "f16"
46
+ compile = false
47
+ temperature = 0.8
48
+ top_k = 40
49
+ max_new_tokens = 500
5m-monarch/final.jld2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5364f0702509441887b48e740aa155436b25ff6967fde9c5825a54c921456ee2
3
+ size 76817788
5m-monarch/step_12000.jld2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9252801d35e3f4ae6134a8535be8dca0b7262351958ac6d5af78bdf2d0b21b1
3
+ size 76817788