Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- d12/meta_000100.json +49 -0
- d12/model_000100.pt +3 -0
- d12/optim_000100_rank0.pt +3 -0
- d12/optim_000100_rank1.pt +3 -0
- d12/optim_000100_rank2.pt +3 -0
- d12/optim_000100_rank3.pt +3 -0
- d3/meta_001000.json +49 -0
- d3/model_001000.pt +3 -0
- d3/optim_001000_rank0.pt +3 -0
- d3/optim_001000_rank1.pt +3 -0
- d3/optim_001000_rank2.pt +3 -0
- d3/optim_001000_rank3.pt +3 -0
- d6/meta_001000.json +49 -0
- d6/meta_002000.json +49 -0
- d6/model_001000.pt +3 -0
- d6/model_002000.pt +3 -0
- d6/optim_001000_rank0.pt +3 -0
- d6/optim_001000_rank1.pt +3 -0
- d6/optim_001000_rank2.pt +3 -0
- d6/optim_001000_rank3.pt +3 -0
- d6/optim_002000_rank0.pt +3 -0
- d6/optim_002000_rank1.pt +3 -0
- d6/optim_002000_rank2.pt +3 -0
- d6/optim_002000_rank3.pt +3 -0
- d8/meta_000100.json +49 -0
- d8/meta_000200.json +49 -0
- d8/meta_000300.json +49 -0
- d8/meta_000400.json +49 -0
- d8/meta_000500.json +49 -0
- d8/model_000100.pt +3 -0
- d8/model_000200.pt +3 -0
- d8/model_000300.pt +3 -0
- d8/model_000400.pt +3 -0
- d8/model_000500.pt +3 -0
- d8/optim_000100_rank0.pt +3 -0
- d8/optim_000100_rank1.pt +3 -0
- d8/optim_000100_rank2.pt +3 -0
- d8/optim_000100_rank3.pt +3 -0
- d8/optim_000200_rank0.pt +3 -0
- d8/optim_000200_rank1.pt +3 -0
- d8/optim_000200_rank2.pt +3 -0
- d8/optim_000200_rank3.pt +3 -0
- d8/optim_000300_rank0.pt +3 -0
- d8/optim_000300_rank1.pt +3 -0
- d8/optim_000300_rank2.pt +3 -0
- d8/optim_000300_rank3.pt +3 -0
- d8/optim_000400_rank0.pt +3 -0
- d8/optim_000400_rank1.pt +3 -0
- d8/optim_000400_rank2.pt +3 -0
- d8/optim_000400_rank3.pt +3 -0
d12/meta_000100.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 100,
|
| 3 |
+
"val_bpb": 0.0,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 12,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 12,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.01,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 400
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": Infinity,
|
| 46 |
+
"smooth_train_loss": 6.405434688995689,
|
| 47 |
+
"total_training_time": 248.85700273513794
|
| 48 |
+
}
|
| 49 |
+
}
|
d12/model_000100.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:350a313a0f95f93e85d5a9a9185db893b449a719d6263a8f4ff1ad81b1570f06
|
| 3 |
+
size 318778513
|
d12/optim_000100_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49efd5ddbf90ddd3270d9508ecf873c941811af0f0411cd00ac50096899345f8
|
| 3 |
+
size 587207086
|
d12/optim_000100_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acf93b2aa7e192493e2027466ec5197220d8097456a7882ad35c53960523431c
|
| 3 |
+
size 587207086
|
d12/optim_000100_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2f95feb521e0ecd10be8baa92ed733e615ba6fa3c9d1d9300125881ac02886f
|
| 3 |
+
size 587207086
|
d12/optim_000100_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef19245e2f6fcd081abfb47e34cd97ddbe87d3f327635d1ffb0a2bfc3dd12b57
|
| 3 |
+
size 587207086
|
d3/meta_001000.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 1000,
|
| 3 |
+
"val_bpb": 0.0,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 6,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 32768
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "test_run3",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 6,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 2000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 4,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.6,
|
| 29 |
+
"final_lr_frac": 0.1,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 10000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 10000,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 1000,
|
| 37 |
+
"model_tag": "d3"
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 4,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 8000
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": Infinity,
|
| 46 |
+
"smooth_train_loss": 4.2489351594465,
|
| 47 |
+
"total_training_time": 4557.3429136276245
|
| 48 |
+
}
|
| 49 |
+
}
|
d3/model_001000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d348e3aad5bde0da72568505dc431ef2ce961fc9447d618f0b3592b520640cd8
|
| 3 |
+
size 469798033
|
d3/optim_001000_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a349cad489a4d41c243c7dbf79cd78a2a539af26b1ea2fabdd7b47eede8633ed
|
| 3 |
+
size 738202030
|
d3/optim_001000_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5fd518a160b3c768b51dbfeeb462aa0c4a2b39ec9432a2f014556ce518317e80
|
| 3 |
+
size 738202030
|
d3/optim_001000_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a19e62f7f4714d8a0e57171b26ba75a0c9f409fb3c712611c0a9efbce3ebc9c
|
| 3 |
+
size 738202030
|
d3/optim_001000_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8bcb099efb36c8967298928c8b98972b31de65153cce1fca1cbe72c4877991b
|
| 3 |
+
size 738202030
|
d6/meta_001000.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 1000,
|
| 3 |
+
"val_bpb": 0.0,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 6,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 16384
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "test_run3",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 6,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 2000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 10000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 10000,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 1000,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 4000
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": Infinity,
|
| 46 |
+
"smooth_train_loss": 4.243802949414332,
|
| 47 |
+
"total_training_time": 2651.2226946353912
|
| 48 |
+
}
|
| 49 |
+
}
|
d6/meta_002000.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 2000,
|
| 3 |
+
"val_bpb": 0.0,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 6,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 16384
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "test_run3",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 6,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 2000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 10000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 10000,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 1000,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 8000
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": Infinity,
|
| 46 |
+
"smooth_train_loss": 3.8795334495346396,
|
| 47 |
+
"total_training_time": 5321.21687579155
|
| 48 |
+
}
|
| 49 |
+
}
|
d6/model_001000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9acfdfef4607abc79fc7b6c2191ca752aec466b92ea15064fd13a98e55e4559c
|
| 3 |
+
size 369118353
|
d6/model_002000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1abe8a2ade9b6873bd0d155a66205244aa3aaeadd657e010fa1fe88b0d38739a
|
| 3 |
+
size 369118353
|
d6/optim_001000_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e384b508f4a4c348424b7d731703e534e0254d9a784e13201e8f7ac7899dffd7
|
| 3 |
+
size 637538734
|
d6/optim_001000_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:109b020f97bc51e0979cd2fab0c1aeb5d8285e016ee539dc87280f9bc5b268cf
|
| 3 |
+
size 637538734
|
d6/optim_001000_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dad68a40103db6cc038fcbd947a75cb88841f7d3384d1b5e2fe38b33ba2e2e57
|
| 3 |
+
size 637538734
|
d6/optim_001000_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3153c46837f228f3b64e502ffe00618cfd85b430c2bbb64860662ec7fa31c6fd
|
| 3 |
+
size 637538734
|
d6/optim_002000_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73384edc87adabd2ae90304351289dc3c8b7eec1ed9e9574ad439dd079ebc680
|
| 3 |
+
size 637538734
|
d6/optim_002000_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f3f113362a46e7d403daadb5b0ac73671e331a6e2e97ada97c04663a43e8210
|
| 3 |
+
size 637538734
|
d6/optim_002000_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac958818c53cb81bf6f3ad687b669fae2adfa6c58769926bcf5df8112bda8891
|
| 3 |
+
size 637538734
|
d6/optim_002000_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:066f32f49e37cba03c5cc025bf3606deb3422d956dbe52fe2e3e10a376bf22af
|
| 3 |
+
size 637538734
|
d8/meta_000100.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 100,
|
| 3 |
+
"val_bpb": 3.3393806087790194,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 8,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 8,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 400
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": 3.3393806087790194,
|
| 46 |
+
"smooth_train_loss": 6.111884546321404,
|
| 47 |
+
"total_training_time": 356.197368144989
|
| 48 |
+
}
|
| 49 |
+
}
|
d8/meta_000200.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 200,
|
| 3 |
+
"val_bpb": 3.3393806087790194,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 8,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 8,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 800
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": 3.3393806087790194,
|
| 46 |
+
"smooth_train_loss": 5.6278228485674235,
|
| 47 |
+
"total_training_time": 713.754873752594
|
| 48 |
+
}
|
| 49 |
+
}
|
d8/meta_000300.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 300,
|
| 3 |
+
"val_bpb": 3.3393806087790194,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 8,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 8,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 1200
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": 3.3393806087790194,
|
| 46 |
+
"smooth_train_loss": 5.214926582822775,
|
| 47 |
+
"total_training_time": 916.8687179088593
|
| 48 |
+
}
|
| 49 |
+
}
|
d8/meta_000400.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 400,
|
| 3 |
+
"val_bpb": 3.3393806087790194,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 8,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 8,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 1600
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": 3.3393806087790194,
|
| 46 |
+
"smooth_train_loss": 4.894732005734474,
|
| 47 |
+
"total_training_time": 1114.4314217567444
|
| 48 |
+
}
|
| 49 |
+
}
|
d8/meta_000500.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 500,
|
| 3 |
+
"val_bpb": 3.3393806087790194,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"max_seq_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 8,
|
| 8 |
+
"n_head": 4,
|
| 9 |
+
"n_embd": 512,
|
| 10 |
+
"n_hidden": 8192
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "bdh_train_21",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 8,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 5000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 8,
|
| 21 |
+
"total_batch_size": 524288,
|
| 22 |
+
"embedding_lr": 0.002,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.1,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"resume_from_step": -1,
|
| 31 |
+
"eval_every": 1000,
|
| 32 |
+
"eval_tokens": 10485760,
|
| 33 |
+
"core_metric_every": 2500,
|
| 34 |
+
"core_metric_max_per_task": 500,
|
| 35 |
+
"sample_every": 100,
|
| 36 |
+
"save_every": 100,
|
| 37 |
+
"model_tag": ""
|
| 38 |
+
},
|
| 39 |
+
"device_batch_size": 8,
|
| 40 |
+
"max_seq_len": 2048,
|
| 41 |
+
"dataloader_state_dict": {
|
| 42 |
+
"batches_yielded": 2000
|
| 43 |
+
},
|
| 44 |
+
"loop_state": {
|
| 45 |
+
"min_val_bpb": 3.3393806087790194,
|
| 46 |
+
"smooth_train_loss": 4.7479387433966025,
|
| 47 |
+
"total_training_time": 1310.6406755447388
|
| 48 |
+
}
|
| 49 |
+
}
|
d8/model_000100.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3dbd2b8634930f46d88cad5e1bbb7efbd1c0b387e72ae0a0087f6fa1a04f52e9
|
| 3 |
+
size 318778513
|
d8/model_000200.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5909ac0d077f1037f8851d9ea3dd0d4993d4b559758aa2ae0816409448b7fe29
|
| 3 |
+
size 318778513
|
d8/model_000300.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43a9d871834ffb01102d6c8372f052e687e42c26b20829c3248e7464de234467
|
| 3 |
+
size 318778513
|
d8/model_000400.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c8af47fc3b1aa8be0e153763a91c8b79a62bca2b2d6cf103f56e53379c91038
|
| 3 |
+
size 318778513
|
d8/model_000500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8de8e3592eeb32ec6a051c8e0d482e7e03aeb6d853139abb0c6d97e3201f5799
|
| 3 |
+
size 318778513
|
d8/optim_000100_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5bdf3a6541d78b8e2471ef45705dc110474755a6ce3b00273b69c4731643a2b
|
| 3 |
+
size 587207086
|
d8/optim_000100_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a10b5473e7991b83280b37e0d15850155635da012e5afa6639aa8daeab6323bd
|
| 3 |
+
size 587207086
|
d8/optim_000100_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51df7ed5ebb99811f252a309b4e9977c544043cbb0b18d238780fefd40c818db
|
| 3 |
+
size 587207086
|
d8/optim_000100_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58d890e2cd4e6e946762dd8587620399cf3dcc5f73b6757cad60e49f7dcd1f68
|
| 3 |
+
size 587207086
|
d8/optim_000200_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d49d47b3177cd34ebb323a6156695b809c117bcb8c43a3fc65c42e6314fe98
|
| 3 |
+
size 587207086
|
d8/optim_000200_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75ab518579673d93b7b185797fa3e1ea78e9b0ee408086499cb898cdb5adefcb
|
| 3 |
+
size 587207086
|
d8/optim_000200_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2972685e0aca24873fc3d001fc6f4f8ff00196da808dce00ffda8909d38791f2
|
| 3 |
+
size 587207086
|
d8/optim_000200_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7de15a53569b744e8a8594d6c1c2d624d47898d9a78bcb733a9468094c8fbf6
|
| 3 |
+
size 587207086
|
d8/optim_000300_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:544300413606fcff1a28d12d248ba1afbb79d758c7ef3dd03d4b1c9e8089dd9e
|
| 3 |
+
size 587207086
|
d8/optim_000300_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17f05728002a1f1fc5b776bf74a5e75e5da8a067c4d53fd5a00eeb7d88280149
|
| 3 |
+
size 587207086
|
d8/optim_000300_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aac3be5f855f039e0ce42ce760593eddaa910e1fedc289e1100e3a9ab71d071
|
| 3 |
+
size 587207086
|
d8/optim_000300_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efec265a0e42f1ff074645da4224e56f85ae96ae1229d2fbf96737962d2042af
|
| 3 |
+
size 587207086
|
d8/optim_000400_rank0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61def9bf8011e0328bfc0d880f37a9866bcf99a96bc310f516253689f7fd37d0
|
| 3 |
+
size 587207086
|
d8/optim_000400_rank1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0645de31aefc64d863931e2d1642b3c21db43e1d77d86d711d8912c81485acc4
|
| 3 |
+
size 587207086
|
d8/optim_000400_rank2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38a89c97f46ccc153fa73813675ccfcd88fd97d7f72f943912f00349f5fb93d1
|
| 3 |
+
size 587207086
|
d8/optim_000400_rank3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6fe2a60db1f8f0711f7f5207ab97e5082f278eeb819023e77ae918de0f377af
|
| 3 |
+
size 587207086
|