ByteMaster01 commited on
Commit
22d3a75
·
verified ·
1 Parent(s): cc71e9f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
d12/meta_000100.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 100,
3
+ "val_bpb": 0.0,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 12,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 12,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.01,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 400
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": Infinity,
46
+ "smooth_train_loss": 6.405434688995689,
47
+ "total_training_time": 248.85700273513794
48
+ }
49
+ }
d12/model_000100.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350a313a0f95f93e85d5a9a9185db893b449a719d6263a8f4ff1ad81b1570f06
3
+ size 318778513
d12/optim_000100_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49efd5ddbf90ddd3270d9508ecf873c941811af0f0411cd00ac50096899345f8
3
+ size 587207086
d12/optim_000100_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acf93b2aa7e192493e2027466ec5197220d8097456a7882ad35c53960523431c
3
+ size 587207086
d12/optim_000100_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f95feb521e0ecd10be8baa92ed733e615ba6fa3c9d1d9300125881ac02886f
3
+ size 587207086
d12/optim_000100_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef19245e2f6fcd081abfb47e34cd97ddbe87d3f327635d1ffb0a2bfc3dd12b57
3
+ size 587207086
d3/meta_001000.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 1000,
3
+ "val_bpb": 0.0,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 6,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 32768
11
+ },
12
+ "user_config": {
13
+ "run": "test_run3",
14
+ "device_type": "",
15
+ "depth": 6,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 2000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 4,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.6,
29
+ "final_lr_frac": 0.1,
30
+ "resume_from_step": -1,
31
+ "eval_every": 10000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 10000,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 1000,
37
+ "model_tag": "d3"
38
+ },
39
+ "device_batch_size": 4,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 8000
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": Infinity,
46
+ "smooth_train_loss": 4.2489351594465,
47
+ "total_training_time": 4557.3429136276245
48
+ }
49
+ }
d3/model_001000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d348e3aad5bde0da72568505dc431ef2ce961fc9447d618f0b3592b520640cd8
3
+ size 469798033
d3/optim_001000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a349cad489a4d41c243c7dbf79cd78a2a539af26b1ea2fabdd7b47eede8633ed
3
+ size 738202030
d3/optim_001000_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd518a160b3c768b51dbfeeb462aa0c4a2b39ec9432a2f014556ce518317e80
3
+ size 738202030
d3/optim_001000_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a19e62f7f4714d8a0e57171b26ba75a0c9f409fb3c712611c0a9efbce3ebc9c
3
+ size 738202030
d3/optim_001000_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8bcb099efb36c8967298928c8b98972b31de65153cce1fca1cbe72c4877991b
3
+ size 738202030
d6/meta_001000.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 1000,
3
+ "val_bpb": 0.0,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 6,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 16384
11
+ },
12
+ "user_config": {
13
+ "run": "test_run3",
14
+ "device_type": "",
15
+ "depth": 6,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 2000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 10000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 10000,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 1000,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 4000
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": Infinity,
46
+ "smooth_train_loss": 4.243802949414332,
47
+ "total_training_time": 2651.2226946353912
48
+ }
49
+ }
d6/meta_002000.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 2000,
3
+ "val_bpb": 0.0,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 6,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 16384
11
+ },
12
+ "user_config": {
13
+ "run": "test_run3",
14
+ "device_type": "",
15
+ "depth": 6,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 2000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 10000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 10000,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 1000,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 8000
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": Infinity,
46
+ "smooth_train_loss": 3.8795334495346396,
47
+ "total_training_time": 5321.21687579155
48
+ }
49
+ }
d6/model_001000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9acfdfef4607abc79fc7b6c2191ca752aec466b92ea15064fd13a98e55e4559c
3
+ size 369118353
d6/model_002000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1abe8a2ade9b6873bd0d155a66205244aa3aaeadd657e010fa1fe88b0d38739a
3
+ size 369118353
d6/optim_001000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e384b508f4a4c348424b7d731703e534e0254d9a784e13201e8f7ac7899dffd7
3
+ size 637538734
d6/optim_001000_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109b020f97bc51e0979cd2fab0c1aeb5d8285e016ee539dc87280f9bc5b268cf
3
+ size 637538734
d6/optim_001000_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad68a40103db6cc038fcbd947a75cb88841f7d3384d1b5e2fe38b33ba2e2e57
3
+ size 637538734
d6/optim_001000_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3153c46837f228f3b64e502ffe00618cfd85b430c2bbb64860662ec7fa31c6fd
3
+ size 637538734
d6/optim_002000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73384edc87adabd2ae90304351289dc3c8b7eec1ed9e9574ad439dd079ebc680
3
+ size 637538734
d6/optim_002000_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f3f113362a46e7d403daadb5b0ac73671e331a6e2e97ada97c04663a43e8210
3
+ size 637538734
d6/optim_002000_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac958818c53cb81bf6f3ad687b669fae2adfa6c58769926bcf5df8112bda8891
3
+ size 637538734
d6/optim_002000_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:066f32f49e37cba03c5cc025bf3606deb3422d956dbe52fe2e3e10a376bf22af
3
+ size 637538734
d8/meta_000100.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 100,
3
+ "val_bpb": 3.3393806087790194,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 400
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": 3.3393806087790194,
46
+ "smooth_train_loss": 6.111884546321404,
47
+ "total_training_time": 356.197368144989
48
+ }
49
+ }
d8/meta_000200.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 200,
3
+ "val_bpb": 3.3393806087790194,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 800
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": 3.3393806087790194,
46
+ "smooth_train_loss": 5.6278228485674235,
47
+ "total_training_time": 713.754873752594
48
+ }
49
+ }
d8/meta_000300.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 300,
3
+ "val_bpb": 3.3393806087790194,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 1200
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": 3.3393806087790194,
46
+ "smooth_train_loss": 5.214926582822775,
47
+ "total_training_time": 916.8687179088593
48
+ }
49
+ }
d8/meta_000400.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 400,
3
+ "val_bpb": 3.3393806087790194,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 1600
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": 3.3393806087790194,
46
+ "smooth_train_loss": 4.894732005734474,
47
+ "total_training_time": 1114.4314217567444
48
+ }
49
+ }
d8/meta_000500.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 500,
3
+ "val_bpb": 3.3393806087790194,
4
+ "model_config": {
5
+ "max_seq_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_embd": 512,
10
+ "n_hidden": 8192
11
+ },
12
+ "user_config": {
13
+ "run": "bdh_train_21",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 2048,
17
+ "num_iterations": 5000,
18
+ "target_flops": -1.0,
19
+ "target_param_data_ratio": 20,
20
+ "device_batch_size": 8,
21
+ "total_batch_size": 524288,
22
+ "embedding_lr": 0.002,
23
+ "unembedding_lr": 0.004,
24
+ "weight_decay": 0.1,
25
+ "matrix_lr": 0.02,
26
+ "grad_clip": 1.0,
27
+ "warmup_ratio": 0.0,
28
+ "warmdown_ratio": 0.2,
29
+ "final_lr_frac": 0.0,
30
+ "resume_from_step": -1,
31
+ "eval_every": 1000,
32
+ "eval_tokens": 10485760,
33
+ "core_metric_every": 2500,
34
+ "core_metric_max_per_task": 500,
35
+ "sample_every": 100,
36
+ "save_every": 100,
37
+ "model_tag": ""
38
+ },
39
+ "device_batch_size": 8,
40
+ "max_seq_len": 2048,
41
+ "dataloader_state_dict": {
42
+ "batches_yielded": 2000
43
+ },
44
+ "loop_state": {
45
+ "min_val_bpb": 3.3393806087790194,
46
+ "smooth_train_loss": 4.7479387433966025,
47
+ "total_training_time": 1310.6406755447388
48
+ }
49
+ }
d8/model_000100.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dbd2b8634930f46d88cad5e1bbb7efbd1c0b387e72ae0a0087f6fa1a04f52e9
3
+ size 318778513
d8/model_000200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5909ac0d077f1037f8851d9ea3dd0d4993d4b559758aa2ae0816409448b7fe29
3
+ size 318778513
d8/model_000300.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43a9d871834ffb01102d6c8372f052e687e42c26b20829c3248e7464de234467
3
+ size 318778513
d8/model_000400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c8af47fc3b1aa8be0e153763a91c8b79a62bca2b2d6cf103f56e53379c91038
3
+ size 318778513
d8/model_000500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de8e3592eeb32ec6a051c8e0d482e7e03aeb6d853139abb0c6d97e3201f5799
3
+ size 318778513
d8/optim_000100_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5bdf3a6541d78b8e2471ef45705dc110474755a6ce3b00273b69c4731643a2b
3
+ size 587207086
d8/optim_000100_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a10b5473e7991b83280b37e0d15850155635da012e5afa6639aa8daeab6323bd
3
+ size 587207086
d8/optim_000100_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51df7ed5ebb99811f252a309b4e9977c544043cbb0b18d238780fefd40c818db
3
+ size 587207086
d8/optim_000100_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d890e2cd4e6e946762dd8587620399cf3dcc5f73b6757cad60e49f7dcd1f68
3
+ size 587207086
d8/optim_000200_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d49d47b3177cd34ebb323a6156695b809c117bcb8c43a3fc65c42e6314fe98
3
+ size 587207086
d8/optim_000200_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ab518579673d93b7b185797fa3e1ea78e9b0ee408086499cb898cdb5adefcb
3
+ size 587207086
d8/optim_000200_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2972685e0aca24873fc3d001fc6f4f8ff00196da808dce00ffda8909d38791f2
3
+ size 587207086
d8/optim_000200_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7de15a53569b744e8a8594d6c1c2d624d47898d9a78bcb733a9468094c8fbf6
3
+ size 587207086
d8/optim_000300_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:544300413606fcff1a28d12d248ba1afbb79d758c7ef3dd03d4b1c9e8089dd9e
3
+ size 587207086
d8/optim_000300_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f05728002a1f1fc5b776bf74a5e75e5da8a067c4d53fd5a00eeb7d88280149
3
+ size 587207086
d8/optim_000300_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aac3be5f855f039e0ce42ce760593eddaa910e1fedc289e1100e3a9ab71d071
3
+ size 587207086
d8/optim_000300_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efec265a0e42f1ff074645da4224e56f85ae96ae1229d2fbf96737962d2042af
3
+ size 587207086
d8/optim_000400_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61def9bf8011e0328bfc0d880f37a9866bcf99a96bc310f516253689f7fd37d0
3
+ size 587207086
d8/optim_000400_rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0645de31aefc64d863931e2d1642b3c21db43e1d77d86d711d8912c81485acc4
3
+ size 587207086
d8/optim_000400_rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38a89c97f46ccc153fa73813675ccfcd88fd97d7f72f943912f00349f5fb93d1
3
+ size 587207086
d8/optim_000400_rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6fe2a60db1f8f0711f7f5207ab97e5082f278eeb819023e77ae918de0f377af
3
+ size 587207086