craa commited on
Commit
4f2325f
·
verified ·
1 Parent(s): 3150974

Training in progress, step 20000, checkpoint

Browse files
checkpoint-20000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.47.0.dev0",
29
+ "use_cache": true,
30
+ "vocab_size": 52000
31
+ }
checkpoint-20000/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.47.0.dev0"
6
+ }
checkpoint-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c7bf1da973c281e5fe11e7e6dc1ff45138c76551662980c067755fe27a5fd8e
3
+ size 503128704
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9a89f649aa95ec8991c173c5a647ec970fec177fbea3ac3aed13fec8eb6590f
3
+ size 1006351290
checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d15aa29e940ac62de0085b65053e52ce4711ff337a969c82de576589e02694
3
+ size 14244
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4081d50f1887f3a982b2f3cabb2e58590020a0424181d8b5b776143e4ebaaf72
3
+ size 1064
checkpoint-20000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-20000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/trainer_state.json ADDED
@@ -0,0 +1,3013 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.5808968544006348,
3
+ "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__495/checkpoint-20000",
4
+ "epoch": 2.1563342318059298,
5
+ "eval_steps": 1000,
6
+ "global_step": 20000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005390835579514825,
13
+ "grad_norm": 0.9464320540428162,
14
+ "learning_rate": 0.000276,
15
+ "loss": 8.7571,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.01078167115902965,
20
+ "grad_norm": 1.1756318807601929,
21
+ "learning_rate": 0.0005759999999999999,
22
+ "loss": 7.0078,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.016172506738544475,
27
+ "grad_norm": 1.551442265510559,
28
+ "learning_rate": 0.000599702104695089,
29
+ "loss": 6.6048,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.0215633423180593,
34
+ "grad_norm": 0.7034249901771545,
35
+ "learning_rate": 0.0005993783054506205,
36
+ "loss": 6.3332,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.026954177897574125,
41
+ "grad_norm": 1.0177743434906006,
42
+ "learning_rate": 0.0005990545062061521,
43
+ "loss": 6.1389,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.03234501347708895,
48
+ "grad_norm": 1.3281992673873901,
49
+ "learning_rate": 0.0005987307069616836,
50
+ "loss": 6.0274,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.03773584905660377,
55
+ "grad_norm": 1.3769625425338745,
56
+ "learning_rate": 0.0005984069077172153,
57
+ "loss": 5.9227,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.0431266846361186,
62
+ "grad_norm": 1.3584883213043213,
63
+ "learning_rate": 0.0005980831084727469,
64
+ "loss": 5.8655,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.04851752021563342,
69
+ "grad_norm": 0.9795990586280823,
70
+ "learning_rate": 0.0005977593092282784,
71
+ "loss": 5.78,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.05390835579514825,
76
+ "grad_norm": 1.4081448316574097,
77
+ "learning_rate": 0.00059743550998381,
78
+ "loss": 5.7304,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.05929919137466307,
83
+ "grad_norm": 1.7430726289749146,
84
+ "learning_rate": 0.0005971117107393416,
85
+ "loss": 5.6254,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.0646900269541779,
90
+ "grad_norm": 1.4288103580474854,
91
+ "learning_rate": 0.0005967879114948732,
92
+ "loss": 5.5997,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.07008086253369272,
97
+ "grad_norm": 0.9352820515632629,
98
+ "learning_rate": 0.0005964641122504047,
99
+ "loss": 5.5009,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.07547169811320754,
104
+ "grad_norm": 1.0235956907272339,
105
+ "learning_rate": 0.0005961403130059363,
106
+ "loss": 5.4834,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.08086253369272237,
111
+ "grad_norm": 1.1283656358718872,
112
+ "learning_rate": 0.0005958165137614678,
113
+ "loss": 5.4096,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.0862533692722372,
118
+ "grad_norm": 1.1757662296295166,
119
+ "learning_rate": 0.0005954927145169995,
120
+ "loss": 5.3508,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.09164420485175202,
125
+ "grad_norm": 1.2652894258499146,
126
+ "learning_rate": 0.0005951689152725309,
127
+ "loss": 5.3137,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.09703504043126684,
132
+ "grad_norm": 0.8220955729484558,
133
+ "learning_rate": 0.0005948451160280626,
134
+ "loss": 5.234,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.10242587601078167,
139
+ "grad_norm": 1.0178474187850952,
140
+ "learning_rate": 0.0005945213167835941,
141
+ "loss": 5.2211,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.1078167115902965,
146
+ "grad_norm": 1.0003489255905151,
147
+ "learning_rate": 0.0005941975175391257,
148
+ "loss": 5.1665,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.1078167115902965,
153
+ "eval_accuracy": 0.2206364212520268,
154
+ "eval_loss": 5.09123420715332,
155
+ "eval_runtime": 146.3271,
156
+ "eval_samples_per_second": 123.087,
157
+ "eval_steps_per_second": 7.695,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.11320754716981132,
162
+ "grad_norm": 1.0129122734069824,
163
+ "learning_rate": 0.0005938737182946572,
164
+ "loss": 5.1253,
165
+ "step": 1050
166
+ },
167
+ {
168
+ "epoch": 0.11859838274932614,
169
+ "grad_norm": 1.2447277307510376,
170
+ "learning_rate": 0.0005935499190501888,
171
+ "loss": 5.0843,
172
+ "step": 1100
173
+ },
174
+ {
175
+ "epoch": 0.12398921832884097,
176
+ "grad_norm": 1.1244721412658691,
177
+ "learning_rate": 0.0005932261198057204,
178
+ "loss": 5.0632,
179
+ "step": 1150
180
+ },
181
+ {
182
+ "epoch": 0.1293800539083558,
183
+ "grad_norm": 1.0002185106277466,
184
+ "learning_rate": 0.000592902320561252,
185
+ "loss": 5.0409,
186
+ "step": 1200
187
+ },
188
+ {
189
+ "epoch": 0.1347708894878706,
190
+ "grad_norm": 1.237203598022461,
191
+ "learning_rate": 0.0005925785213167835,
192
+ "loss": 5.0158,
193
+ "step": 1250
194
+ },
195
+ {
196
+ "epoch": 0.14016172506738545,
197
+ "grad_norm": 0.9483816623687744,
198
+ "learning_rate": 0.0005922547220723151,
199
+ "loss": 4.9578,
200
+ "step": 1300
201
+ },
202
+ {
203
+ "epoch": 0.14555256064690028,
204
+ "grad_norm": 1.1492619514465332,
205
+ "learning_rate": 0.0005919309228278468,
206
+ "loss": 4.9422,
207
+ "step": 1350
208
+ },
209
+ {
210
+ "epoch": 0.1509433962264151,
211
+ "grad_norm": 0.8103901743888855,
212
+ "learning_rate": 0.0005916071235833783,
213
+ "loss": 4.9161,
214
+ "step": 1400
215
+ },
216
+ {
217
+ "epoch": 0.15633423180592992,
218
+ "grad_norm": 0.9849348068237305,
219
+ "learning_rate": 0.0005912833243389097,
220
+ "loss": 4.8937,
221
+ "step": 1450
222
+ },
223
+ {
224
+ "epoch": 0.16172506738544473,
225
+ "grad_norm": 1.0992136001586914,
226
+ "learning_rate": 0.0005909595250944414,
227
+ "loss": 4.8771,
228
+ "step": 1500
229
+ },
230
+ {
231
+ "epoch": 0.16711590296495957,
232
+ "grad_norm": 0.7955754995346069,
233
+ "learning_rate": 0.000590635725849973,
234
+ "loss": 4.8337,
235
+ "step": 1550
236
+ },
237
+ {
238
+ "epoch": 0.1725067385444744,
239
+ "grad_norm": 0.8510985970497131,
240
+ "learning_rate": 0.0005903119266055045,
241
+ "loss": 4.8487,
242
+ "step": 1600
243
+ },
244
+ {
245
+ "epoch": 0.1778975741239892,
246
+ "grad_norm": 0.8423263430595398,
247
+ "learning_rate": 0.0005899881273610361,
248
+ "loss": 4.8251,
249
+ "step": 1650
250
+ },
251
+ {
252
+ "epoch": 0.18328840970350405,
253
+ "grad_norm": 0.8307051658630371,
254
+ "learning_rate": 0.0005896643281165677,
255
+ "loss": 4.7528,
256
+ "step": 1700
257
+ },
258
+ {
259
+ "epoch": 0.18867924528301888,
260
+ "grad_norm": 1.171060562133789,
261
+ "learning_rate": 0.0005893405288720993,
262
+ "loss": 4.7617,
263
+ "step": 1750
264
+ },
265
+ {
266
+ "epoch": 0.1940700808625337,
267
+ "grad_norm": 1.0469765663146973,
268
+ "learning_rate": 0.0005890167296276308,
269
+ "loss": 4.7597,
270
+ "step": 1800
271
+ },
272
+ {
273
+ "epoch": 0.19946091644204852,
274
+ "grad_norm": 0.8637204170227051,
275
+ "learning_rate": 0.0005886929303831624,
276
+ "loss": 4.7128,
277
+ "step": 1850
278
+ },
279
+ {
280
+ "epoch": 0.20485175202156333,
281
+ "grad_norm": 0.9173099398612976,
282
+ "learning_rate": 0.0005883691311386939,
283
+ "loss": 4.7067,
284
+ "step": 1900
285
+ },
286
+ {
287
+ "epoch": 0.21024258760107817,
288
+ "grad_norm": 0.9003922343254089,
289
+ "learning_rate": 0.0005880453318942256,
290
+ "loss": 4.6783,
291
+ "step": 1950
292
+ },
293
+ {
294
+ "epoch": 0.215633423180593,
295
+ "grad_norm": 0.9870163798332214,
296
+ "learning_rate": 0.0005877215326497571,
297
+ "loss": 4.6555,
298
+ "step": 2000
299
+ },
300
+ {
301
+ "epoch": 0.215633423180593,
302
+ "eval_accuracy": 0.260708366848222,
303
+ "eval_loss": 4.6007513999938965,
304
+ "eval_runtime": 144.747,
305
+ "eval_samples_per_second": 124.431,
306
+ "eval_steps_per_second": 7.779,
307
+ "step": 2000
308
+ },
309
+ {
310
+ "epoch": 0.2210242587601078,
311
+ "grad_norm": 0.8778396248817444,
312
+ "learning_rate": 0.0005873977334052887,
313
+ "loss": 4.6696,
314
+ "step": 2050
315
+ },
316
+ {
317
+ "epoch": 0.22641509433962265,
318
+ "grad_norm": 0.8375086188316345,
319
+ "learning_rate": 0.0005870739341608202,
320
+ "loss": 4.6395,
321
+ "step": 2100
322
+ },
323
+ {
324
+ "epoch": 0.23180592991913745,
325
+ "grad_norm": 0.8078502416610718,
326
+ "learning_rate": 0.0005867501349163519,
327
+ "loss": 4.5823,
328
+ "step": 2150
329
+ },
330
+ {
331
+ "epoch": 0.2371967654986523,
332
+ "grad_norm": 0.8603857755661011,
333
+ "learning_rate": 0.0005864263356718833,
334
+ "loss": 4.6175,
335
+ "step": 2200
336
+ },
337
+ {
338
+ "epoch": 0.24258760107816713,
339
+ "grad_norm": 0.8307099342346191,
340
+ "learning_rate": 0.000586102536427415,
341
+ "loss": 4.5541,
342
+ "step": 2250
343
+ },
344
+ {
345
+ "epoch": 0.24797843665768193,
346
+ "grad_norm": 1.0236228704452515,
347
+ "learning_rate": 0.0005857787371829465,
348
+ "loss": 4.5368,
349
+ "step": 2300
350
+ },
351
+ {
352
+ "epoch": 0.25336927223719674,
353
+ "grad_norm": 0.9307533502578735,
354
+ "learning_rate": 0.0005854549379384781,
355
+ "loss": 4.5326,
356
+ "step": 2350
357
+ },
358
+ {
359
+ "epoch": 0.2587601078167116,
360
+ "grad_norm": 1.4191182851791382,
361
+ "learning_rate": 0.0005851311386940096,
362
+ "loss": 4.5253,
363
+ "step": 2400
364
+ },
365
+ {
366
+ "epoch": 0.2641509433962264,
367
+ "grad_norm": 1.015572428703308,
368
+ "learning_rate": 0.0005848073394495412,
369
+ "loss": 4.5272,
370
+ "step": 2450
371
+ },
372
+ {
373
+ "epoch": 0.2695417789757412,
374
+ "grad_norm": 0.8903120756149292,
375
+ "learning_rate": 0.0005844835402050728,
376
+ "loss": 4.5219,
377
+ "step": 2500
378
+ },
379
+ {
380
+ "epoch": 0.2749326145552561,
381
+ "grad_norm": 1.0123368501663208,
382
+ "learning_rate": 0.0005841597409606044,
383
+ "loss": 4.4772,
384
+ "step": 2550
385
+ },
386
+ {
387
+ "epoch": 0.2803234501347709,
388
+ "grad_norm": 0.925151526927948,
389
+ "learning_rate": 0.000583835941716136,
390
+ "loss": 4.4624,
391
+ "step": 2600
392
+ },
393
+ {
394
+ "epoch": 0.2857142857142857,
395
+ "grad_norm": 1.1478705406188965,
396
+ "learning_rate": 0.0005835121424716675,
397
+ "loss": 4.4605,
398
+ "step": 2650
399
+ },
400
+ {
401
+ "epoch": 0.29110512129380056,
402
+ "grad_norm": 1.0130943059921265,
403
+ "learning_rate": 0.0005831883432271992,
404
+ "loss": 4.4529,
405
+ "step": 2700
406
+ },
407
+ {
408
+ "epoch": 0.29649595687331537,
409
+ "grad_norm": 0.991671621799469,
410
+ "learning_rate": 0.0005828645439827307,
411
+ "loss": 4.4101,
412
+ "step": 2750
413
+ },
414
+ {
415
+ "epoch": 0.3018867924528302,
416
+ "grad_norm": 0.8699747323989868,
417
+ "learning_rate": 0.0005825407447382622,
418
+ "loss": 4.4505,
419
+ "step": 2800
420
+ },
421
+ {
422
+ "epoch": 0.30727762803234504,
423
+ "grad_norm": 1.0381041765213013,
424
+ "learning_rate": 0.0005822169454937938,
425
+ "loss": 4.4171,
426
+ "step": 2850
427
+ },
428
+ {
429
+ "epoch": 0.31266846361185985,
430
+ "grad_norm": 1.0296149253845215,
431
+ "learning_rate": 0.0005818931462493254,
432
+ "loss": 4.4014,
433
+ "step": 2900
434
+ },
435
+ {
436
+ "epoch": 0.31805929919137466,
437
+ "grad_norm": 0.9270951747894287,
438
+ "learning_rate": 0.0005815693470048569,
439
+ "loss": 4.3968,
440
+ "step": 2950
441
+ },
442
+ {
443
+ "epoch": 0.32345013477088946,
444
+ "grad_norm": 0.7531670331954956,
445
+ "learning_rate": 0.0005812455477603885,
446
+ "loss": 4.3901,
447
+ "step": 3000
448
+ },
449
+ {
450
+ "epoch": 0.32345013477088946,
451
+ "eval_accuracy": 0.29159338482103947,
452
+ "eval_loss": 4.296316146850586,
453
+ "eval_runtime": 144.5889,
454
+ "eval_samples_per_second": 124.567,
455
+ "eval_steps_per_second": 7.788,
456
+ "step": 3000
457
+ },
458
+ {
459
+ "epoch": 0.3288409703504043,
460
+ "grad_norm": 0.8791877627372742,
461
+ "learning_rate": 0.0005809217485159201,
462
+ "loss": 4.3683,
463
+ "step": 3050
464
+ },
465
+ {
466
+ "epoch": 0.33423180592991913,
467
+ "grad_norm": 0.843708872795105,
468
+ "learning_rate": 0.0005805979492714517,
469
+ "loss": 4.3643,
470
+ "step": 3100
471
+ },
472
+ {
473
+ "epoch": 0.33962264150943394,
474
+ "grad_norm": 0.8123868703842163,
475
+ "learning_rate": 0.0005802741500269832,
476
+ "loss": 4.3262,
477
+ "step": 3150
478
+ },
479
+ {
480
+ "epoch": 0.3450134770889488,
481
+ "grad_norm": 0.9056026339530945,
482
+ "learning_rate": 0.0005799503507825148,
483
+ "loss": 4.3298,
484
+ "step": 3200
485
+ },
486
+ {
487
+ "epoch": 0.3504043126684636,
488
+ "grad_norm": 0.6567044854164124,
489
+ "learning_rate": 0.0005796265515380463,
490
+ "loss": 4.306,
491
+ "step": 3250
492
+ },
493
+ {
494
+ "epoch": 0.3557951482479784,
495
+ "grad_norm": 0.9095497131347656,
496
+ "learning_rate": 0.000579302752293578,
497
+ "loss": 4.328,
498
+ "step": 3300
499
+ },
500
+ {
501
+ "epoch": 0.3611859838274933,
502
+ "grad_norm": 0.8091392517089844,
503
+ "learning_rate": 0.0005789789530491095,
504
+ "loss": 4.3044,
505
+ "step": 3350
506
+ },
507
+ {
508
+ "epoch": 0.3665768194070081,
509
+ "grad_norm": 0.9594192504882812,
510
+ "learning_rate": 0.0005786551538046411,
511
+ "loss": 4.2963,
512
+ "step": 3400
513
+ },
514
+ {
515
+ "epoch": 0.3719676549865229,
516
+ "grad_norm": 0.72095787525177,
517
+ "learning_rate": 0.0005783313545601726,
518
+ "loss": 4.2802,
519
+ "step": 3450
520
+ },
521
+ {
522
+ "epoch": 0.37735849056603776,
523
+ "grad_norm": 0.7558978199958801,
524
+ "learning_rate": 0.0005780075553157043,
525
+ "loss": 4.2589,
526
+ "step": 3500
527
+ },
528
+ {
529
+ "epoch": 0.38274932614555257,
530
+ "grad_norm": 0.7811341881752014,
531
+ "learning_rate": 0.0005776837560712357,
532
+ "loss": 4.264,
533
+ "step": 3550
534
+ },
535
+ {
536
+ "epoch": 0.3881401617250674,
537
+ "grad_norm": 0.9022195935249329,
538
+ "learning_rate": 0.0005773599568267673,
539
+ "loss": 4.2651,
540
+ "step": 3600
541
+ },
542
+ {
543
+ "epoch": 0.3935309973045822,
544
+ "grad_norm": 0.9639933109283447,
545
+ "learning_rate": 0.0005770361575822989,
546
+ "loss": 4.2789,
547
+ "step": 3650
548
+ },
549
+ {
550
+ "epoch": 0.39892183288409705,
551
+ "grad_norm": 0.9333063960075378,
552
+ "learning_rate": 0.0005767123583378305,
553
+ "loss": 4.24,
554
+ "step": 3700
555
+ },
556
+ {
557
+ "epoch": 0.40431266846361186,
558
+ "grad_norm": 0.7508504986763,
559
+ "learning_rate": 0.000576388559093362,
560
+ "loss": 4.2359,
561
+ "step": 3750
562
+ },
563
+ {
564
+ "epoch": 0.40970350404312667,
565
+ "grad_norm": 0.7458257079124451,
566
+ "learning_rate": 0.0005760647598488936,
567
+ "loss": 4.2555,
568
+ "step": 3800
569
+ },
570
+ {
571
+ "epoch": 0.41509433962264153,
572
+ "grad_norm": 0.7919742465019226,
573
+ "learning_rate": 0.0005757409606044253,
574
+ "loss": 4.2366,
575
+ "step": 3850
576
+ },
577
+ {
578
+ "epoch": 0.42048517520215634,
579
+ "grad_norm": 0.9453123211860657,
580
+ "learning_rate": 0.0005754171613599568,
581
+ "loss": 4.2055,
582
+ "step": 3900
583
+ },
584
+ {
585
+ "epoch": 0.42587601078167114,
586
+ "grad_norm": 0.6152997612953186,
587
+ "learning_rate": 0.0005750933621154884,
588
+ "loss": 4.201,
589
+ "step": 3950
590
+ },
591
+ {
592
+ "epoch": 0.431266846361186,
593
+ "grad_norm": 0.8247600197792053,
594
+ "learning_rate": 0.0005747695628710199,
595
+ "loss": 4.2075,
596
+ "step": 4000
597
+ },
598
+ {
599
+ "epoch": 0.431266846361186,
600
+ "eval_accuracy": 0.30819272110400897,
601
+ "eval_loss": 4.134169101715088,
602
+ "eval_runtime": 144.6112,
603
+ "eval_samples_per_second": 124.548,
604
+ "eval_steps_per_second": 7.786,
605
+ "step": 4000
606
+ },
607
+ {
608
+ "epoch": 0.4366576819407008,
609
+ "grad_norm": 0.8243815898895264,
610
+ "learning_rate": 0.0005744457636265515,
611
+ "loss": 4.2068,
612
+ "step": 4050
613
+ },
614
+ {
615
+ "epoch": 0.4420485175202156,
616
+ "grad_norm": 0.6849672794342041,
617
+ "learning_rate": 0.0005741219643820831,
618
+ "loss": 4.2091,
619
+ "step": 4100
620
+ },
621
+ {
622
+ "epoch": 0.4474393530997305,
623
+ "grad_norm": 0.7505493760108948,
624
+ "learning_rate": 0.0005737981651376146,
625
+ "loss": 4.193,
626
+ "step": 4150
627
+ },
628
+ {
629
+ "epoch": 0.4528301886792453,
630
+ "grad_norm": 0.6466169953346252,
631
+ "learning_rate": 0.0005734743658931462,
632
+ "loss": 4.1829,
633
+ "step": 4200
634
+ },
635
+ {
636
+ "epoch": 0.4582210242587601,
637
+ "grad_norm": 0.5937972664833069,
638
+ "learning_rate": 0.0005731505666486778,
639
+ "loss": 4.1738,
640
+ "step": 4250
641
+ },
642
+ {
643
+ "epoch": 0.4636118598382749,
644
+ "grad_norm": 0.7906216979026794,
645
+ "learning_rate": 0.0005728267674042093,
646
+ "loss": 4.1668,
647
+ "step": 4300
648
+ },
649
+ {
650
+ "epoch": 0.46900269541778977,
651
+ "grad_norm": 0.744433581829071,
652
+ "learning_rate": 0.0005725029681597409,
653
+ "loss": 4.1723,
654
+ "step": 4350
655
+ },
656
+ {
657
+ "epoch": 0.4743935309973046,
658
+ "grad_norm": 0.8073228597640991,
659
+ "learning_rate": 0.0005721791689152725,
660
+ "loss": 4.158,
661
+ "step": 4400
662
+ },
663
+ {
664
+ "epoch": 0.4797843665768194,
665
+ "grad_norm": 0.8262885808944702,
666
+ "learning_rate": 0.0005718553696708041,
667
+ "loss": 4.1602,
668
+ "step": 4450
669
+ },
670
+ {
671
+ "epoch": 0.48517520215633425,
672
+ "grad_norm": 0.6594825387001038,
673
+ "learning_rate": 0.0005715315704263356,
674
+ "loss": 4.1434,
675
+ "step": 4500
676
+ },
677
+ {
678
+ "epoch": 0.49056603773584906,
679
+ "grad_norm": 0.6674824357032776,
680
+ "learning_rate": 0.0005712077711818672,
681
+ "loss": 4.1315,
682
+ "step": 4550
683
+ },
684
+ {
685
+ "epoch": 0.49595687331536387,
686
+ "grad_norm": 0.7260637879371643,
687
+ "learning_rate": 0.0005708839719373987,
688
+ "loss": 4.1407,
689
+ "step": 4600
690
+ },
691
+ {
692
+ "epoch": 0.5013477088948787,
693
+ "grad_norm": 0.6827527284622192,
694
+ "learning_rate": 0.0005705601726929304,
695
+ "loss": 4.1196,
696
+ "step": 4650
697
+ },
698
+ {
699
+ "epoch": 0.5067385444743935,
700
+ "grad_norm": 0.774723470211029,
701
+ "learning_rate": 0.0005702363734484619,
702
+ "loss": 4.1256,
703
+ "step": 4700
704
+ },
705
+ {
706
+ "epoch": 0.5121293800539084,
707
+ "grad_norm": 0.6491437554359436,
708
+ "learning_rate": 0.0005699125742039935,
709
+ "loss": 4.1063,
710
+ "step": 4750
711
+ },
712
+ {
713
+ "epoch": 0.5175202156334232,
714
+ "grad_norm": 0.7277990579605103,
715
+ "learning_rate": 0.000569588774959525,
716
+ "loss": 4.1197,
717
+ "step": 4800
718
+ },
719
+ {
720
+ "epoch": 0.522911051212938,
721
+ "grad_norm": 0.6551647782325745,
722
+ "learning_rate": 0.0005692649757150567,
723
+ "loss": 4.1135,
724
+ "step": 4850
725
+ },
726
+ {
727
+ "epoch": 0.5283018867924528,
728
+ "grad_norm": 0.7088435292243958,
729
+ "learning_rate": 0.0005689411764705881,
730
+ "loss": 4.1188,
731
+ "step": 4900
732
+ },
733
+ {
734
+ "epoch": 0.5336927223719676,
735
+ "grad_norm": 0.7585951089859009,
736
+ "learning_rate": 0.0005686173772261197,
737
+ "loss": 4.0928,
738
+ "step": 4950
739
+ },
740
+ {
741
+ "epoch": 0.5390835579514824,
742
+ "grad_norm": 0.673933744430542,
743
+ "learning_rate": 0.0005682935779816514,
744
+ "loss": 4.0819,
745
+ "step": 5000
746
+ },
747
+ {
748
+ "epoch": 0.5390835579514824,
749
+ "eval_accuracy": 0.31859362744293795,
750
+ "eval_loss": 4.022838115692139,
751
+ "eval_runtime": 144.8217,
752
+ "eval_samples_per_second": 124.367,
753
+ "eval_steps_per_second": 7.775,
754
+ "step": 5000
755
+ },
756
+ {
757
+ "epoch": 0.5444743935309974,
758
+ "grad_norm": 0.617730438709259,
759
+ "learning_rate": 0.0005679697787371829,
760
+ "loss": 4.0793,
761
+ "step": 5050
762
+ },
763
+ {
764
+ "epoch": 0.5498652291105122,
765
+ "grad_norm": 0.6957946419715881,
766
+ "learning_rate": 0.0005676459794927145,
767
+ "loss": 4.0821,
768
+ "step": 5100
769
+ },
770
+ {
771
+ "epoch": 0.555256064690027,
772
+ "grad_norm": 0.6225258708000183,
773
+ "learning_rate": 0.000567322180248246,
774
+ "loss": 4.0873,
775
+ "step": 5150
776
+ },
777
+ {
778
+ "epoch": 0.5606469002695418,
779
+ "grad_norm": 0.6864507794380188,
780
+ "learning_rate": 0.0005669983810037777,
781
+ "loss": 4.0634,
782
+ "step": 5200
783
+ },
784
+ {
785
+ "epoch": 0.5660377358490566,
786
+ "grad_norm": 0.642590343952179,
787
+ "learning_rate": 0.0005666745817593092,
788
+ "loss": 4.0649,
789
+ "step": 5250
790
+ },
791
+ {
792
+ "epoch": 0.5714285714285714,
793
+ "grad_norm": 0.614847719669342,
794
+ "learning_rate": 0.0005663507825148408,
795
+ "loss": 4.0476,
796
+ "step": 5300
797
+ },
798
+ {
799
+ "epoch": 0.5768194070080862,
800
+ "grad_norm": 0.7870457768440247,
801
+ "learning_rate": 0.0005660269832703723,
802
+ "loss": 4.063,
803
+ "step": 5350
804
+ },
805
+ {
806
+ "epoch": 0.5822102425876011,
807
+ "grad_norm": 0.6789854764938354,
808
+ "learning_rate": 0.0005657031840259039,
809
+ "loss": 4.0518,
810
+ "step": 5400
811
+ },
812
+ {
813
+ "epoch": 0.5876010781671159,
814
+ "grad_norm": 0.6867729425430298,
815
+ "learning_rate": 0.0005653793847814355,
816
+ "loss": 4.0614,
817
+ "step": 5450
818
+ },
819
+ {
820
+ "epoch": 0.5929919137466307,
821
+ "grad_norm": 0.5899894833564758,
822
+ "learning_rate": 0.000565055585536967,
823
+ "loss": 4.0639,
824
+ "step": 5500
825
+ },
826
+ {
827
+ "epoch": 0.5983827493261455,
828
+ "grad_norm": 0.6574368476867676,
829
+ "learning_rate": 0.0005647317862924986,
830
+ "loss": 4.067,
831
+ "step": 5550
832
+ },
833
+ {
834
+ "epoch": 0.6037735849056604,
835
+ "grad_norm": 0.7431745529174805,
836
+ "learning_rate": 0.0005644079870480302,
837
+ "loss": 4.0425,
838
+ "step": 5600
839
+ },
840
+ {
841
+ "epoch": 0.6091644204851752,
842
+ "grad_norm": 0.6241595149040222,
843
+ "learning_rate": 0.0005640841878035617,
844
+ "loss": 4.0319,
845
+ "step": 5650
846
+ },
847
+ {
848
+ "epoch": 0.6145552560646901,
849
+ "grad_norm": 0.6736788749694824,
850
+ "learning_rate": 0.0005637603885590933,
851
+ "loss": 4.0366,
852
+ "step": 5700
853
+ },
854
+ {
855
+ "epoch": 0.6199460916442049,
856
+ "grad_norm": 0.6149032711982727,
857
+ "learning_rate": 0.0005634365893146248,
858
+ "loss": 4.0495,
859
+ "step": 5750
860
+ },
861
+ {
862
+ "epoch": 0.6253369272237197,
863
+ "grad_norm": 0.6543477177619934,
864
+ "learning_rate": 0.0005631127900701565,
865
+ "loss": 4.042,
866
+ "step": 5800
867
+ },
868
+ {
869
+ "epoch": 0.6307277628032345,
870
+ "grad_norm": 0.6215724945068359,
871
+ "learning_rate": 0.000562788990825688,
872
+ "loss": 4.0478,
873
+ "step": 5850
874
+ },
875
+ {
876
+ "epoch": 0.6361185983827493,
877
+ "grad_norm": 0.6606348752975464,
878
+ "learning_rate": 0.0005624651915812196,
879
+ "loss": 4.0192,
880
+ "step": 5900
881
+ },
882
+ {
883
+ "epoch": 0.6415094339622641,
884
+ "grad_norm": 0.7944669723510742,
885
+ "learning_rate": 0.0005621413923367511,
886
+ "loss": 4.012,
887
+ "step": 5950
888
+ },
889
+ {
890
+ "epoch": 0.6469002695417789,
891
+ "grad_norm": 0.6075884699821472,
892
+ "learning_rate": 0.0005618175930922828,
893
+ "loss": 4.0201,
894
+ "step": 6000
895
+ },
896
+ {
897
+ "epoch": 0.6469002695417789,
898
+ "eval_accuracy": 0.3257018080166491,
899
+ "eval_loss": 3.940925359725952,
900
+ "eval_runtime": 144.8645,
901
+ "eval_samples_per_second": 124.33,
902
+ "eval_steps_per_second": 7.773,
903
+ "step": 6000
904
+ },
905
+ {
906
+ "epoch": 0.6522911051212938,
907
+ "grad_norm": 0.7395045757293701,
908
+ "learning_rate": 0.0005614937938478143,
909
+ "loss": 3.9927,
910
+ "step": 6050
911
+ },
912
+ {
913
+ "epoch": 0.6576819407008087,
914
+ "grad_norm": 0.69764643907547,
915
+ "learning_rate": 0.0005611699946033459,
916
+ "loss": 4.0252,
917
+ "step": 6100
918
+ },
919
+ {
920
+ "epoch": 0.6630727762803235,
921
+ "grad_norm": 0.5907176733016968,
922
+ "learning_rate": 0.0005608461953588774,
923
+ "loss": 4.0071,
924
+ "step": 6150
925
+ },
926
+ {
927
+ "epoch": 0.6684636118598383,
928
+ "grad_norm": 0.7374498248100281,
929
+ "learning_rate": 0.000560522396114409,
930
+ "loss": 3.9854,
931
+ "step": 6200
932
+ },
933
+ {
934
+ "epoch": 0.6738544474393531,
935
+ "grad_norm": 0.5937222838401794,
936
+ "learning_rate": 0.0005601985968699405,
937
+ "loss": 4.003,
938
+ "step": 6250
939
+ },
940
+ {
941
+ "epoch": 0.6792452830188679,
942
+ "grad_norm": 0.6928643584251404,
943
+ "learning_rate": 0.0005598747976254721,
944
+ "loss": 4.022,
945
+ "step": 6300
946
+ },
947
+ {
948
+ "epoch": 0.6846361185983828,
949
+ "grad_norm": 0.6832301020622253,
950
+ "learning_rate": 0.0005595509983810038,
951
+ "loss": 3.9816,
952
+ "step": 6350
953
+ },
954
+ {
955
+ "epoch": 0.6900269541778976,
956
+ "grad_norm": 0.6227492690086365,
957
+ "learning_rate": 0.0005592271991365353,
958
+ "loss": 3.9855,
959
+ "step": 6400
960
+ },
961
+ {
962
+ "epoch": 0.6954177897574124,
963
+ "grad_norm": 0.6595360636711121,
964
+ "learning_rate": 0.0005589033998920669,
965
+ "loss": 3.9733,
966
+ "step": 6450
967
+ },
968
+ {
969
+ "epoch": 0.7008086253369272,
970
+ "grad_norm": 0.6538481116294861,
971
+ "learning_rate": 0.0005585796006475984,
972
+ "loss": 3.9867,
973
+ "step": 6500
974
+ },
975
+ {
976
+ "epoch": 0.706199460916442,
977
+ "grad_norm": 0.6099511384963989,
978
+ "learning_rate": 0.0005582558014031301,
979
+ "loss": 4.009,
980
+ "step": 6550
981
+ },
982
+ {
983
+ "epoch": 0.7115902964959568,
984
+ "grad_norm": 0.5673043131828308,
985
+ "learning_rate": 0.0005579320021586616,
986
+ "loss": 3.9638,
987
+ "step": 6600
988
+ },
989
+ {
990
+ "epoch": 0.7169811320754716,
991
+ "grad_norm": 0.7762152552604675,
992
+ "learning_rate": 0.0005576082029141932,
993
+ "loss": 3.9942,
994
+ "step": 6650
995
+ },
996
+ {
997
+ "epoch": 0.7223719676549866,
998
+ "grad_norm": 0.6117172241210938,
999
+ "learning_rate": 0.0005572844036697247,
1000
+ "loss": 3.9672,
1001
+ "step": 6700
1002
+ },
1003
+ {
1004
+ "epoch": 0.7277628032345014,
1005
+ "grad_norm": 0.6088191866874695,
1006
+ "learning_rate": 0.0005569606044252563,
1007
+ "loss": 3.9738,
1008
+ "step": 6750
1009
+ },
1010
+ {
1011
+ "epoch": 0.7331536388140162,
1012
+ "grad_norm": 0.6414440274238586,
1013
+ "learning_rate": 0.0005566368051807879,
1014
+ "loss": 3.9641,
1015
+ "step": 6800
1016
+ },
1017
+ {
1018
+ "epoch": 0.738544474393531,
1019
+ "grad_norm": 0.8121737241744995,
1020
+ "learning_rate": 0.0005563130059363194,
1021
+ "loss": 3.9423,
1022
+ "step": 6850
1023
+ },
1024
+ {
1025
+ "epoch": 0.7439353099730458,
1026
+ "grad_norm": 0.7500906586647034,
1027
+ "learning_rate": 0.000555989206691851,
1028
+ "loss": 3.9677,
1029
+ "step": 6900
1030
+ },
1031
+ {
1032
+ "epoch": 0.7493261455525606,
1033
+ "grad_norm": 0.6089574694633484,
1034
+ "learning_rate": 0.0005556654074473826,
1035
+ "loss": 3.9647,
1036
+ "step": 6950
1037
+ },
1038
+ {
1039
+ "epoch": 0.7547169811320755,
1040
+ "grad_norm": 0.7751880288124084,
1041
+ "learning_rate": 0.0005553416082029141,
1042
+ "loss": 3.9519,
1043
+ "step": 7000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7547169811320755,
1047
+ "eval_accuracy": 0.3312385418738994,
1048
+ "eval_loss": 3.88531494140625,
1049
+ "eval_runtime": 144.5963,
1050
+ "eval_samples_per_second": 124.561,
1051
+ "eval_steps_per_second": 7.787,
1052
+ "step": 7000
1053
+ },
1054
+ {
1055
+ "epoch": 0.7601078167115903,
1056
+ "grad_norm": 0.6487019658088684,
1057
+ "learning_rate": 0.0005550178089584457,
1058
+ "loss": 3.9428,
1059
+ "step": 7050
1060
+ },
1061
+ {
1062
+ "epoch": 0.7654986522911051,
1063
+ "grad_norm": 0.6093623638153076,
1064
+ "learning_rate": 0.0005546940097139772,
1065
+ "loss": 3.9544,
1066
+ "step": 7100
1067
+ },
1068
+ {
1069
+ "epoch": 0.77088948787062,
1070
+ "grad_norm": 0.5531768798828125,
1071
+ "learning_rate": 0.0005543702104695089,
1072
+ "loss": 3.9569,
1073
+ "step": 7150
1074
+ },
1075
+ {
1076
+ "epoch": 0.7762803234501348,
1077
+ "grad_norm": 0.6401906609535217,
1078
+ "learning_rate": 0.0005540464112250404,
1079
+ "loss": 3.9564,
1080
+ "step": 7200
1081
+ },
1082
+ {
1083
+ "epoch": 0.7816711590296496,
1084
+ "grad_norm": 0.5921440720558167,
1085
+ "learning_rate": 0.000553722611980572,
1086
+ "loss": 3.9108,
1087
+ "step": 7250
1088
+ },
1089
+ {
1090
+ "epoch": 0.7870619946091644,
1091
+ "grad_norm": 0.6791409254074097,
1092
+ "learning_rate": 0.0005533988127361035,
1093
+ "loss": 3.9488,
1094
+ "step": 7300
1095
+ },
1096
+ {
1097
+ "epoch": 0.7924528301886793,
1098
+ "grad_norm": 0.6472693681716919,
1099
+ "learning_rate": 0.0005530750134916352,
1100
+ "loss": 3.9341,
1101
+ "step": 7350
1102
+ },
1103
+ {
1104
+ "epoch": 0.7978436657681941,
1105
+ "grad_norm": 0.6375269889831543,
1106
+ "learning_rate": 0.0005527512142471668,
1107
+ "loss": 3.9339,
1108
+ "step": 7400
1109
+ },
1110
+ {
1111
+ "epoch": 0.8032345013477089,
1112
+ "grad_norm": 0.626977264881134,
1113
+ "learning_rate": 0.0005524274150026982,
1114
+ "loss": 3.9247,
1115
+ "step": 7450
1116
+ },
1117
+ {
1118
+ "epoch": 0.8086253369272237,
1119
+ "grad_norm": 0.696706235408783,
1120
+ "learning_rate": 0.0005521036157582299,
1121
+ "loss": 3.9116,
1122
+ "step": 7500
1123
+ },
1124
+ {
1125
+ "epoch": 0.8140161725067385,
1126
+ "grad_norm": 0.594398558139801,
1127
+ "learning_rate": 0.0005517798165137614,
1128
+ "loss": 3.9356,
1129
+ "step": 7550
1130
+ },
1131
+ {
1132
+ "epoch": 0.8194070080862533,
1133
+ "grad_norm": 0.6184767484664917,
1134
+ "learning_rate": 0.000551456017269293,
1135
+ "loss": 3.9384,
1136
+ "step": 7600
1137
+ },
1138
+ {
1139
+ "epoch": 0.8247978436657682,
1140
+ "grad_norm": 0.5797574520111084,
1141
+ "learning_rate": 0.0005511322180248245,
1142
+ "loss": 3.9149,
1143
+ "step": 7650
1144
+ },
1145
+ {
1146
+ "epoch": 0.8301886792452831,
1147
+ "grad_norm": 0.5616925954818726,
1148
+ "learning_rate": 0.0005508084187803562,
1149
+ "loss": 3.9173,
1150
+ "step": 7700
1151
+ },
1152
+ {
1153
+ "epoch": 0.8355795148247979,
1154
+ "grad_norm": 0.6098619103431702,
1155
+ "learning_rate": 0.0005504846195358877,
1156
+ "loss": 3.8999,
1157
+ "step": 7750
1158
+ },
1159
+ {
1160
+ "epoch": 0.8409703504043127,
1161
+ "grad_norm": 0.6354513764381409,
1162
+ "learning_rate": 0.0005501608202914193,
1163
+ "loss": 3.9284,
1164
+ "step": 7800
1165
+ },
1166
+ {
1167
+ "epoch": 0.8463611859838275,
1168
+ "grad_norm": 0.6673269867897034,
1169
+ "learning_rate": 0.0005498370210469508,
1170
+ "loss": 3.909,
1171
+ "step": 7850
1172
+ },
1173
+ {
1174
+ "epoch": 0.8517520215633423,
1175
+ "grad_norm": 0.539669930934906,
1176
+ "learning_rate": 0.0005495132218024824,
1177
+ "loss": 3.9074,
1178
+ "step": 7900
1179
+ },
1180
+ {
1181
+ "epoch": 0.8571428571428571,
1182
+ "grad_norm": 0.618360161781311,
1183
+ "learning_rate": 0.000549189422558014,
1184
+ "loss": 3.9046,
1185
+ "step": 7950
1186
+ },
1187
+ {
1188
+ "epoch": 0.862533692722372,
1189
+ "grad_norm": 0.6503387689590454,
1190
+ "learning_rate": 0.0005488656233135456,
1191
+ "loss": 3.8917,
1192
+ "step": 8000
1193
+ },
1194
+ {
1195
+ "epoch": 0.862533692722372,
1196
+ "eval_accuracy": 0.33594255999893957,
1197
+ "eval_loss": 3.8360989093780518,
1198
+ "eval_runtime": 144.8786,
1199
+ "eval_samples_per_second": 124.318,
1200
+ "eval_steps_per_second": 7.772,
1201
+ "step": 8000
1202
+ },
1203
+ {
1204
+ "epoch": 0.8679245283018868,
1205
+ "grad_norm": 0.6546068787574768,
1206
+ "learning_rate": 0.0005485418240690771,
1207
+ "loss": 3.9022,
1208
+ "step": 8050
1209
+ },
1210
+ {
1211
+ "epoch": 0.8733153638814016,
1212
+ "grad_norm": 0.6453102231025696,
1213
+ "learning_rate": 0.0005482180248246087,
1214
+ "loss": 3.8843,
1215
+ "step": 8100
1216
+ },
1217
+ {
1218
+ "epoch": 0.8787061994609164,
1219
+ "grad_norm": 0.5534443259239197,
1220
+ "learning_rate": 0.0005478942255801403,
1221
+ "loss": 3.8988,
1222
+ "step": 8150
1223
+ },
1224
+ {
1225
+ "epoch": 0.8840970350404312,
1226
+ "grad_norm": 0.6230263113975525,
1227
+ "learning_rate": 0.0005475704263356718,
1228
+ "loss": 3.8879,
1229
+ "step": 8200
1230
+ },
1231
+ {
1232
+ "epoch": 0.889487870619946,
1233
+ "grad_norm": 0.6733997464179993,
1234
+ "learning_rate": 0.0005472466270912034,
1235
+ "loss": 3.8729,
1236
+ "step": 8250
1237
+ },
1238
+ {
1239
+ "epoch": 0.894878706199461,
1240
+ "grad_norm": 0.6633635759353638,
1241
+ "learning_rate": 0.000546922827846735,
1242
+ "loss": 3.8991,
1243
+ "step": 8300
1244
+ },
1245
+ {
1246
+ "epoch": 0.9002695417789758,
1247
+ "grad_norm": 0.6503227949142456,
1248
+ "learning_rate": 0.0005465990286022665,
1249
+ "loss": 3.8792,
1250
+ "step": 8350
1251
+ },
1252
+ {
1253
+ "epoch": 0.9056603773584906,
1254
+ "grad_norm": 0.7666671276092529,
1255
+ "learning_rate": 0.0005462752293577981,
1256
+ "loss": 3.8944,
1257
+ "step": 8400
1258
+ },
1259
+ {
1260
+ "epoch": 0.9110512129380054,
1261
+ "grad_norm": 0.6036889553070068,
1262
+ "learning_rate": 0.0005459514301133296,
1263
+ "loss": 3.9057,
1264
+ "step": 8450
1265
+ },
1266
+ {
1267
+ "epoch": 0.9164420485175202,
1268
+ "grad_norm": 0.6154916882514954,
1269
+ "learning_rate": 0.0005456276308688613,
1270
+ "loss": 3.894,
1271
+ "step": 8500
1272
+ },
1273
+ {
1274
+ "epoch": 0.921832884097035,
1275
+ "grad_norm": 0.5653623938560486,
1276
+ "learning_rate": 0.0005453038316243929,
1277
+ "loss": 3.8763,
1278
+ "step": 8550
1279
+ },
1280
+ {
1281
+ "epoch": 0.9272237196765498,
1282
+ "grad_norm": 0.5695276856422424,
1283
+ "learning_rate": 0.0005449800323799244,
1284
+ "loss": 3.8709,
1285
+ "step": 8600
1286
+ },
1287
+ {
1288
+ "epoch": 0.9326145552560647,
1289
+ "grad_norm": 0.5406414270401001,
1290
+ "learning_rate": 0.000544656233135456,
1291
+ "loss": 3.8623,
1292
+ "step": 8650
1293
+ },
1294
+ {
1295
+ "epoch": 0.9380053908355795,
1296
+ "grad_norm": 0.5876409411430359,
1297
+ "learning_rate": 0.0005443324338909875,
1298
+ "loss": 3.867,
1299
+ "step": 8700
1300
+ },
1301
+ {
1302
+ "epoch": 0.9433962264150944,
1303
+ "grad_norm": 0.5680667757987976,
1304
+ "learning_rate": 0.0005440086346465192,
1305
+ "loss": 3.8691,
1306
+ "step": 8750
1307
+ },
1308
+ {
1309
+ "epoch": 0.9487870619946092,
1310
+ "grad_norm": 0.5708035230636597,
1311
+ "learning_rate": 0.0005436848354020506,
1312
+ "loss": 3.8589,
1313
+ "step": 8800
1314
+ },
1315
+ {
1316
+ "epoch": 0.954177897574124,
1317
+ "grad_norm": 0.5115900635719299,
1318
+ "learning_rate": 0.0005433610361575823,
1319
+ "loss": 3.8488,
1320
+ "step": 8850
1321
+ },
1322
+ {
1323
+ "epoch": 0.9595687331536388,
1324
+ "grad_norm": 0.6006278395652771,
1325
+ "learning_rate": 0.0005430372369131138,
1326
+ "loss": 3.8652,
1327
+ "step": 8900
1328
+ },
1329
+ {
1330
+ "epoch": 0.9649595687331537,
1331
+ "grad_norm": 0.6494414210319519,
1332
+ "learning_rate": 0.0005427134376686454,
1333
+ "loss": 3.8716,
1334
+ "step": 8950
1335
+ },
1336
+ {
1337
+ "epoch": 0.9703504043126685,
1338
+ "grad_norm": 0.5888278484344482,
1339
+ "learning_rate": 0.0005423896384241769,
1340
+ "loss": 3.8592,
1341
+ "step": 9000
1342
+ },
1343
+ {
1344
+ "epoch": 0.9703504043126685,
1345
+ "eval_accuracy": 0.34041004078720755,
1346
+ "eval_loss": 3.7949342727661133,
1347
+ "eval_runtime": 144.616,
1348
+ "eval_samples_per_second": 124.544,
1349
+ "eval_steps_per_second": 7.786,
1350
+ "step": 9000
1351
+ },
1352
+ {
1353
+ "epoch": 0.9757412398921833,
1354
+ "grad_norm": 0.5697623491287231,
1355
+ "learning_rate": 0.0005420658391797086,
1356
+ "loss": 3.855,
1357
+ "step": 9050
1358
+ },
1359
+ {
1360
+ "epoch": 0.9811320754716981,
1361
+ "grad_norm": 0.6447634696960449,
1362
+ "learning_rate": 0.0005417420399352401,
1363
+ "loss": 3.866,
1364
+ "step": 9100
1365
+ },
1366
+ {
1367
+ "epoch": 0.9865229110512129,
1368
+ "grad_norm": 0.583088219165802,
1369
+ "learning_rate": 0.0005414182406907717,
1370
+ "loss": 3.8415,
1371
+ "step": 9150
1372
+ },
1373
+ {
1374
+ "epoch": 0.9919137466307277,
1375
+ "grad_norm": 0.5796465277671814,
1376
+ "learning_rate": 0.0005410944414463032,
1377
+ "loss": 3.8376,
1378
+ "step": 9200
1379
+ },
1380
+ {
1381
+ "epoch": 0.9973045822102425,
1382
+ "grad_norm": 0.6316399574279785,
1383
+ "learning_rate": 0.0005407706422018348,
1384
+ "loss": 3.8524,
1385
+ "step": 9250
1386
+ },
1387
+ {
1388
+ "epoch": 1.0026954177897573,
1389
+ "grad_norm": 0.5727218389511108,
1390
+ "learning_rate": 0.0005404468429573664,
1391
+ "loss": 3.8268,
1392
+ "step": 9300
1393
+ },
1394
+ {
1395
+ "epoch": 1.0080862533692723,
1396
+ "grad_norm": 0.5713456869125366,
1397
+ "learning_rate": 0.000540123043712898,
1398
+ "loss": 3.7921,
1399
+ "step": 9350
1400
+ },
1401
+ {
1402
+ "epoch": 1.013477088948787,
1403
+ "grad_norm": 0.6198617815971375,
1404
+ "learning_rate": 0.0005397992444684295,
1405
+ "loss": 3.7876,
1406
+ "step": 9400
1407
+ },
1408
+ {
1409
+ "epoch": 1.0188679245283019,
1410
+ "grad_norm": 0.5770072937011719,
1411
+ "learning_rate": 0.0005394754452239611,
1412
+ "loss": 3.7856,
1413
+ "step": 9450
1414
+ },
1415
+ {
1416
+ "epoch": 1.0242587601078168,
1417
+ "grad_norm": 0.5920796990394592,
1418
+ "learning_rate": 0.0005391516459794927,
1419
+ "loss": 3.7855,
1420
+ "step": 9500
1421
+ },
1422
+ {
1423
+ "epoch": 1.0296495956873315,
1424
+ "grad_norm": 0.5636431574821472,
1425
+ "learning_rate": 0.0005388278467350242,
1426
+ "loss": 3.7857,
1427
+ "step": 9550
1428
+ },
1429
+ {
1430
+ "epoch": 1.0350404312668464,
1431
+ "grad_norm": 0.669791579246521,
1432
+ "learning_rate": 0.0005385040474905557,
1433
+ "loss": 3.7723,
1434
+ "step": 9600
1435
+ },
1436
+ {
1437
+ "epoch": 1.0404312668463611,
1438
+ "grad_norm": 0.6635991930961609,
1439
+ "learning_rate": 0.0005381802482460874,
1440
+ "loss": 3.8142,
1441
+ "step": 9650
1442
+ },
1443
+ {
1444
+ "epoch": 1.045822102425876,
1445
+ "grad_norm": 0.601437509059906,
1446
+ "learning_rate": 0.000537856449001619,
1447
+ "loss": 3.798,
1448
+ "step": 9700
1449
+ },
1450
+ {
1451
+ "epoch": 1.0512129380053907,
1452
+ "grad_norm": 0.5584002137184143,
1453
+ "learning_rate": 0.0005375326497571505,
1454
+ "loss": 3.7985,
1455
+ "step": 9750
1456
+ },
1457
+ {
1458
+ "epoch": 1.0566037735849056,
1459
+ "grad_norm": 0.5526735782623291,
1460
+ "learning_rate": 0.000537208850512682,
1461
+ "loss": 3.8074,
1462
+ "step": 9800
1463
+ },
1464
+ {
1465
+ "epoch": 1.0619946091644206,
1466
+ "grad_norm": 0.5977774858474731,
1467
+ "learning_rate": 0.0005368850512682137,
1468
+ "loss": 3.7989,
1469
+ "step": 9850
1470
+ },
1471
+ {
1472
+ "epoch": 1.0673854447439353,
1473
+ "grad_norm": 0.5950897932052612,
1474
+ "learning_rate": 0.0005365612520237453,
1475
+ "loss": 3.7924,
1476
+ "step": 9900
1477
+ },
1478
+ {
1479
+ "epoch": 1.0727762803234502,
1480
+ "grad_norm": 0.5768362283706665,
1481
+ "learning_rate": 0.0005362374527792768,
1482
+ "loss": 3.7855,
1483
+ "step": 9950
1484
+ },
1485
+ {
1486
+ "epoch": 1.0781671159029649,
1487
+ "grad_norm": 0.6516294479370117,
1488
+ "learning_rate": 0.0005359136535348084,
1489
+ "loss": 3.7811,
1490
+ "step": 10000
1491
+ },
1492
+ {
1493
+ "epoch": 1.0781671159029649,
1494
+ "eval_accuracy": 0.34365658907510427,
1495
+ "eval_loss": 3.762051582336426,
1496
+ "eval_runtime": 144.5563,
1497
+ "eval_samples_per_second": 124.595,
1498
+ "eval_steps_per_second": 7.789,
1499
+ "step": 10000
1500
+ },
1501
+ {
1502
+ "epoch": 1.0835579514824798,
1503
+ "grad_norm": 0.54844731092453,
1504
+ "learning_rate": 0.0005355898542903399,
1505
+ "loss": 3.7802,
1506
+ "step": 10050
1507
+ },
1508
+ {
1509
+ "epoch": 1.0889487870619945,
1510
+ "grad_norm": 0.6120631694793701,
1511
+ "learning_rate": 0.0005352660550458716,
1512
+ "loss": 3.789,
1513
+ "step": 10100
1514
+ },
1515
+ {
1516
+ "epoch": 1.0943396226415094,
1517
+ "grad_norm": 0.5617305636405945,
1518
+ "learning_rate": 0.000534942255801403,
1519
+ "loss": 3.784,
1520
+ "step": 10150
1521
+ },
1522
+ {
1523
+ "epoch": 1.0997304582210243,
1524
+ "grad_norm": 0.6198379993438721,
1525
+ "learning_rate": 0.0005346184565569347,
1526
+ "loss": 3.7835,
1527
+ "step": 10200
1528
+ },
1529
+ {
1530
+ "epoch": 1.105121293800539,
1531
+ "grad_norm": 0.5185474753379822,
1532
+ "learning_rate": 0.0005342946573124662,
1533
+ "loss": 3.7928,
1534
+ "step": 10250
1535
+ },
1536
+ {
1537
+ "epoch": 1.110512129380054,
1538
+ "grad_norm": 0.5625671744346619,
1539
+ "learning_rate": 0.0005339708580679978,
1540
+ "loss": 3.7792,
1541
+ "step": 10300
1542
+ },
1543
+ {
1544
+ "epoch": 1.1159029649595686,
1545
+ "grad_norm": 0.6791179180145264,
1546
+ "learning_rate": 0.0005336470588235293,
1547
+ "loss": 3.7647,
1548
+ "step": 10350
1549
+ },
1550
+ {
1551
+ "epoch": 1.1212938005390836,
1552
+ "grad_norm": 0.5952463746070862,
1553
+ "learning_rate": 0.000533323259579061,
1554
+ "loss": 3.7826,
1555
+ "step": 10400
1556
+ },
1557
+ {
1558
+ "epoch": 1.1266846361185983,
1559
+ "grad_norm": 0.5367695689201355,
1560
+ "learning_rate": 0.0005329994603345925,
1561
+ "loss": 3.7714,
1562
+ "step": 10450
1563
+ },
1564
+ {
1565
+ "epoch": 1.1320754716981132,
1566
+ "grad_norm": 0.6577426195144653,
1567
+ "learning_rate": 0.0005326756610901241,
1568
+ "loss": 3.7588,
1569
+ "step": 10500
1570
+ },
1571
+ {
1572
+ "epoch": 1.137466307277628,
1573
+ "grad_norm": 0.5481351017951965,
1574
+ "learning_rate": 0.0005323518618456556,
1575
+ "loss": 3.7808,
1576
+ "step": 10550
1577
+ },
1578
+ {
1579
+ "epoch": 1.1428571428571428,
1580
+ "grad_norm": 0.5973223447799683,
1581
+ "learning_rate": 0.0005320280626011872,
1582
+ "loss": 3.7903,
1583
+ "step": 10600
1584
+ },
1585
+ {
1586
+ "epoch": 1.1482479784366577,
1587
+ "grad_norm": 0.6764339804649353,
1588
+ "learning_rate": 0.0005317042633567188,
1589
+ "loss": 3.7673,
1590
+ "step": 10650
1591
+ },
1592
+ {
1593
+ "epoch": 1.1536388140161726,
1594
+ "grad_norm": 0.6202555894851685,
1595
+ "learning_rate": 0.0005313804641122504,
1596
+ "loss": 3.7633,
1597
+ "step": 10700
1598
+ },
1599
+ {
1600
+ "epoch": 1.1590296495956873,
1601
+ "grad_norm": 0.6169992089271545,
1602
+ "learning_rate": 0.0005310566648677819,
1603
+ "loss": 3.7519,
1604
+ "step": 10750
1605
+ },
1606
+ {
1607
+ "epoch": 1.1644204851752022,
1608
+ "grad_norm": 0.5893779993057251,
1609
+ "learning_rate": 0.0005307328656233135,
1610
+ "loss": 3.7433,
1611
+ "step": 10800
1612
+ },
1613
+ {
1614
+ "epoch": 1.169811320754717,
1615
+ "grad_norm": 0.5708191394805908,
1616
+ "learning_rate": 0.000530409066378845,
1617
+ "loss": 3.7609,
1618
+ "step": 10850
1619
+ },
1620
+ {
1621
+ "epoch": 1.1752021563342319,
1622
+ "grad_norm": 0.5436216592788696,
1623
+ "learning_rate": 0.0005300852671343766,
1624
+ "loss": 3.7635,
1625
+ "step": 10900
1626
+ },
1627
+ {
1628
+ "epoch": 1.1805929919137466,
1629
+ "grad_norm": 0.5945119261741638,
1630
+ "learning_rate": 0.0005297614678899081,
1631
+ "loss": 3.7755,
1632
+ "step": 10950
1633
+ },
1634
+ {
1635
+ "epoch": 1.1859838274932615,
1636
+ "grad_norm": 0.5752778053283691,
1637
+ "learning_rate": 0.0005294376686454398,
1638
+ "loss": 3.7908,
1639
+ "step": 11000
1640
+ },
1641
+ {
1642
+ "epoch": 1.1859838274932615,
1643
+ "eval_accuracy": 0.3459605735612103,
1644
+ "eval_loss": 3.7342617511749268,
1645
+ "eval_runtime": 145.0651,
1646
+ "eval_samples_per_second": 124.158,
1647
+ "eval_steps_per_second": 7.762,
1648
+ "step": 11000
1649
+ },
1650
+ {
1651
+ "epoch": 1.1913746630727764,
1652
+ "grad_norm": 0.5780414342880249,
1653
+ "learning_rate": 0.0005291138694009714,
1654
+ "loss": 3.7773,
1655
+ "step": 11050
1656
+ },
1657
+ {
1658
+ "epoch": 1.196765498652291,
1659
+ "grad_norm": 0.6393083930015564,
1660
+ "learning_rate": 0.0005287900701565029,
1661
+ "loss": 3.7719,
1662
+ "step": 11100
1663
+ },
1664
+ {
1665
+ "epoch": 1.202156334231806,
1666
+ "grad_norm": 0.5772005319595337,
1667
+ "learning_rate": 0.0005284662709120345,
1668
+ "loss": 3.7688,
1669
+ "step": 11150
1670
+ },
1671
+ {
1672
+ "epoch": 1.2075471698113207,
1673
+ "grad_norm": 0.5036531686782837,
1674
+ "learning_rate": 0.0005281424716675661,
1675
+ "loss": 3.7578,
1676
+ "step": 11200
1677
+ },
1678
+ {
1679
+ "epoch": 1.2129380053908356,
1680
+ "grad_norm": 0.56954026222229,
1681
+ "learning_rate": 0.0005278186724230977,
1682
+ "loss": 3.7505,
1683
+ "step": 11250
1684
+ },
1685
+ {
1686
+ "epoch": 1.2183288409703503,
1687
+ "grad_norm": 0.6294519901275635,
1688
+ "learning_rate": 0.0005274948731786292,
1689
+ "loss": 3.7672,
1690
+ "step": 11300
1691
+ },
1692
+ {
1693
+ "epoch": 1.2237196765498652,
1694
+ "grad_norm": 0.6276326775550842,
1695
+ "learning_rate": 0.0005271710739341608,
1696
+ "loss": 3.7523,
1697
+ "step": 11350
1698
+ },
1699
+ {
1700
+ "epoch": 1.2291105121293802,
1701
+ "grad_norm": 0.6219097971916199,
1702
+ "learning_rate": 0.0005268472746896923,
1703
+ "loss": 3.7687,
1704
+ "step": 11400
1705
+ },
1706
+ {
1707
+ "epoch": 1.2345013477088949,
1708
+ "grad_norm": 0.5939977765083313,
1709
+ "learning_rate": 0.000526523475445224,
1710
+ "loss": 3.7595,
1711
+ "step": 11450
1712
+ },
1713
+ {
1714
+ "epoch": 1.2398921832884098,
1715
+ "grad_norm": 0.6375150680541992,
1716
+ "learning_rate": 0.0005261996762007554,
1717
+ "loss": 3.7498,
1718
+ "step": 11500
1719
+ },
1720
+ {
1721
+ "epoch": 1.2452830188679245,
1722
+ "grad_norm": 0.534351110458374,
1723
+ "learning_rate": 0.0005258758769562871,
1724
+ "loss": 3.7602,
1725
+ "step": 11550
1726
+ },
1727
+ {
1728
+ "epoch": 1.2506738544474394,
1729
+ "grad_norm": 0.5753054618835449,
1730
+ "learning_rate": 0.0005255520777118186,
1731
+ "loss": 3.7345,
1732
+ "step": 11600
1733
+ },
1734
+ {
1735
+ "epoch": 1.256064690026954,
1736
+ "grad_norm": 0.5968658328056335,
1737
+ "learning_rate": 0.0005252282784673502,
1738
+ "loss": 3.7679,
1739
+ "step": 11650
1740
+ },
1741
+ {
1742
+ "epoch": 1.261455525606469,
1743
+ "grad_norm": 0.6453511118888855,
1744
+ "learning_rate": 0.0005249044792228817,
1745
+ "loss": 3.7641,
1746
+ "step": 11700
1747
+ },
1748
+ {
1749
+ "epoch": 1.266846361185984,
1750
+ "grad_norm": 0.5844177007675171,
1751
+ "learning_rate": 0.0005245806799784133,
1752
+ "loss": 3.7551,
1753
+ "step": 11750
1754
+ },
1755
+ {
1756
+ "epoch": 1.2722371967654986,
1757
+ "grad_norm": 0.5597920417785645,
1758
+ "learning_rate": 0.0005242568807339449,
1759
+ "loss": 3.7458,
1760
+ "step": 11800
1761
+ },
1762
+ {
1763
+ "epoch": 1.2776280323450135,
1764
+ "grad_norm": 0.6149951815605164,
1765
+ "learning_rate": 0.0005239330814894765,
1766
+ "loss": 3.7582,
1767
+ "step": 11850
1768
+ },
1769
+ {
1770
+ "epoch": 1.2830188679245282,
1771
+ "grad_norm": 0.5561873316764832,
1772
+ "learning_rate": 0.000523609282245008,
1773
+ "loss": 3.7456,
1774
+ "step": 11900
1775
+ },
1776
+ {
1777
+ "epoch": 1.2884097035040432,
1778
+ "grad_norm": 0.6077041029930115,
1779
+ "learning_rate": 0.0005232854830005396,
1780
+ "loss": 3.7601,
1781
+ "step": 11950
1782
+ },
1783
+ {
1784
+ "epoch": 1.2938005390835579,
1785
+ "grad_norm": 0.6389844417572021,
1786
+ "learning_rate": 0.0005229616837560712,
1787
+ "loss": 3.7514,
1788
+ "step": 12000
1789
+ },
1790
+ {
1791
+ "epoch": 1.2938005390835579,
1792
+ "eval_accuracy": 0.3482391332715678,
1793
+ "eval_loss": 3.7092418670654297,
1794
+ "eval_runtime": 144.795,
1795
+ "eval_samples_per_second": 124.39,
1796
+ "eval_steps_per_second": 7.777,
1797
+ "step": 12000
1798
+ },
1799
+ {
1800
+ "epoch": 1.2991913746630728,
1801
+ "grad_norm": 0.6131287217140198,
1802
+ "learning_rate": 0.0005226378845116028,
1803
+ "loss": 3.7472,
1804
+ "step": 12050
1805
+ },
1806
+ {
1807
+ "epoch": 1.3045822102425877,
1808
+ "grad_norm": 0.5687003135681152,
1809
+ "learning_rate": 0.0005223140852671344,
1810
+ "loss": 3.7541,
1811
+ "step": 12100
1812
+ },
1813
+ {
1814
+ "epoch": 1.3099730458221024,
1815
+ "grad_norm": 0.603330135345459,
1816
+ "learning_rate": 0.0005219902860226659,
1817
+ "loss": 3.7719,
1818
+ "step": 12150
1819
+ },
1820
+ {
1821
+ "epoch": 1.3153638814016173,
1822
+ "grad_norm": 0.5906956791877747,
1823
+ "learning_rate": 0.0005216664867781975,
1824
+ "loss": 3.7419,
1825
+ "step": 12200
1826
+ },
1827
+ {
1828
+ "epoch": 1.320754716981132,
1829
+ "grad_norm": 0.5257678627967834,
1830
+ "learning_rate": 0.000521342687533729,
1831
+ "loss": 3.7494,
1832
+ "step": 12250
1833
+ },
1834
+ {
1835
+ "epoch": 1.326145552560647,
1836
+ "grad_norm": 0.55323725938797,
1837
+ "learning_rate": 0.0005210188882892606,
1838
+ "loss": 3.7598,
1839
+ "step": 12300
1840
+ },
1841
+ {
1842
+ "epoch": 1.3315363881401616,
1843
+ "grad_norm": 0.6195726990699768,
1844
+ "learning_rate": 0.0005206950890447922,
1845
+ "loss": 3.7455,
1846
+ "step": 12350
1847
+ },
1848
+ {
1849
+ "epoch": 1.3369272237196765,
1850
+ "grad_norm": 0.6453518271446228,
1851
+ "learning_rate": 0.0005203712898003238,
1852
+ "loss": 3.7346,
1853
+ "step": 12400
1854
+ },
1855
+ {
1856
+ "epoch": 1.3423180592991915,
1857
+ "grad_norm": 0.591871976852417,
1858
+ "learning_rate": 0.0005200474905558553,
1859
+ "loss": 3.7498,
1860
+ "step": 12450
1861
+ },
1862
+ {
1863
+ "epoch": 1.3477088948787062,
1864
+ "grad_norm": 0.8153529167175293,
1865
+ "learning_rate": 0.0005197236913113869,
1866
+ "loss": 3.73,
1867
+ "step": 12500
1868
+ },
1869
+ {
1870
+ "epoch": 1.353099730458221,
1871
+ "grad_norm": 0.6054574847221375,
1872
+ "learning_rate": 0.0005193998920669184,
1873
+ "loss": 3.7438,
1874
+ "step": 12550
1875
+ },
1876
+ {
1877
+ "epoch": 1.3584905660377358,
1878
+ "grad_norm": 0.5890757441520691,
1879
+ "learning_rate": 0.0005190760928224501,
1880
+ "loss": 3.7383,
1881
+ "step": 12600
1882
+ },
1883
+ {
1884
+ "epoch": 1.3638814016172507,
1885
+ "grad_norm": 0.5858767032623291,
1886
+ "learning_rate": 0.0005187522935779816,
1887
+ "loss": 3.7272,
1888
+ "step": 12650
1889
+ },
1890
+ {
1891
+ "epoch": 1.3692722371967654,
1892
+ "grad_norm": 0.5663976073265076,
1893
+ "learning_rate": 0.0005184284943335132,
1894
+ "loss": 3.7232,
1895
+ "step": 12700
1896
+ },
1897
+ {
1898
+ "epoch": 1.3746630727762803,
1899
+ "grad_norm": 0.6173025965690613,
1900
+ "learning_rate": 0.0005181046950890447,
1901
+ "loss": 3.7428,
1902
+ "step": 12750
1903
+ },
1904
+ {
1905
+ "epoch": 1.3800539083557952,
1906
+ "grad_norm": 0.5534438490867615,
1907
+ "learning_rate": 0.0005177808958445764,
1908
+ "loss": 3.7214,
1909
+ "step": 12800
1910
+ },
1911
+ {
1912
+ "epoch": 1.38544474393531,
1913
+ "grad_norm": 0.6565149426460266,
1914
+ "learning_rate": 0.0005174570966001078,
1915
+ "loss": 3.7343,
1916
+ "step": 12850
1917
+ },
1918
+ {
1919
+ "epoch": 1.3908355795148248,
1920
+ "grad_norm": 0.518814742565155,
1921
+ "learning_rate": 0.0005171332973556395,
1922
+ "loss": 3.7343,
1923
+ "step": 12900
1924
+ },
1925
+ {
1926
+ "epoch": 1.3962264150943398,
1927
+ "grad_norm": 0.6175557374954224,
1928
+ "learning_rate": 0.000516809498111171,
1929
+ "loss": 3.726,
1930
+ "step": 12950
1931
+ },
1932
+ {
1933
+ "epoch": 1.4016172506738545,
1934
+ "grad_norm": 0.525554895401001,
1935
+ "learning_rate": 0.0005164856988667026,
1936
+ "loss": 3.7207,
1937
+ "step": 13000
1938
+ },
1939
+ {
1940
+ "epoch": 1.4016172506738545,
1941
+ "eval_accuracy": 0.3513273743019188,
1942
+ "eval_loss": 3.684098482131958,
1943
+ "eval_runtime": 145.0832,
1944
+ "eval_samples_per_second": 124.143,
1945
+ "eval_steps_per_second": 7.761,
1946
+ "step": 13000
1947
+ },
1948
+ {
1949
+ "epoch": 1.4070080862533692,
1950
+ "grad_norm": 0.5694339275360107,
1951
+ "learning_rate": 0.0005161618996222341,
1952
+ "loss": 3.7017,
1953
+ "step": 13050
1954
+ },
1955
+ {
1956
+ "epoch": 1.412398921832884,
1957
+ "grad_norm": 0.5720424056053162,
1958
+ "learning_rate": 0.0005158381003777657,
1959
+ "loss": 3.7224,
1960
+ "step": 13100
1961
+ },
1962
+ {
1963
+ "epoch": 1.417789757412399,
1964
+ "grad_norm": 0.5959078073501587,
1965
+ "learning_rate": 0.0005155143011332973,
1966
+ "loss": 3.7268,
1967
+ "step": 13150
1968
+ },
1969
+ {
1970
+ "epoch": 1.4231805929919137,
1971
+ "grad_norm": 0.5883017182350159,
1972
+ "learning_rate": 0.0005151905018888289,
1973
+ "loss": 3.7077,
1974
+ "step": 13200
1975
+ },
1976
+ {
1977
+ "epoch": 1.4285714285714286,
1978
+ "grad_norm": 0.5214637517929077,
1979
+ "learning_rate": 0.0005148667026443604,
1980
+ "loss": 3.7251,
1981
+ "step": 13250
1982
+ },
1983
+ {
1984
+ "epoch": 1.4339622641509435,
1985
+ "grad_norm": 0.5803607106208801,
1986
+ "learning_rate": 0.000514542903399892,
1987
+ "loss": 3.7207,
1988
+ "step": 13300
1989
+ },
1990
+ {
1991
+ "epoch": 1.4393530997304582,
1992
+ "grad_norm": 0.6287586688995361,
1993
+ "learning_rate": 0.0005142191041554237,
1994
+ "loss": 3.7256,
1995
+ "step": 13350
1996
+ },
1997
+ {
1998
+ "epoch": 1.444743935309973,
1999
+ "grad_norm": 0.614562451839447,
2000
+ "learning_rate": 0.0005138953049109552,
2001
+ "loss": 3.7166,
2002
+ "step": 13400
2003
+ },
2004
+ {
2005
+ "epoch": 1.4501347708894878,
2006
+ "grad_norm": 0.5314472317695618,
2007
+ "learning_rate": 0.0005135715056664868,
2008
+ "loss": 3.7338,
2009
+ "step": 13450
2010
+ },
2011
+ {
2012
+ "epoch": 1.4555256064690028,
2013
+ "grad_norm": 0.6482129693031311,
2014
+ "learning_rate": 0.0005132477064220183,
2015
+ "loss": 3.7264,
2016
+ "step": 13500
2017
+ },
2018
+ {
2019
+ "epoch": 1.4609164420485174,
2020
+ "grad_norm": 0.5918267369270325,
2021
+ "learning_rate": 0.0005129239071775499,
2022
+ "loss": 3.7023,
2023
+ "step": 13550
2024
+ },
2025
+ {
2026
+ "epoch": 1.4663072776280324,
2027
+ "grad_norm": 0.5850944519042969,
2028
+ "learning_rate": 0.0005126001079330814,
2029
+ "loss": 3.7298,
2030
+ "step": 13600
2031
+ },
2032
+ {
2033
+ "epoch": 1.4716981132075473,
2034
+ "grad_norm": 0.5356786847114563,
2035
+ "learning_rate": 0.000512276308688613,
2036
+ "loss": 3.7098,
2037
+ "step": 13650
2038
+ },
2039
+ {
2040
+ "epoch": 1.477088948787062,
2041
+ "grad_norm": 0.5910783410072327,
2042
+ "learning_rate": 0.0005119525094441446,
2043
+ "loss": 3.7309,
2044
+ "step": 13700
2045
+ },
2046
+ {
2047
+ "epoch": 1.482479784366577,
2048
+ "grad_norm": 0.5758869051933289,
2049
+ "learning_rate": 0.0005116287101996762,
2050
+ "loss": 3.703,
2051
+ "step": 13750
2052
+ },
2053
+ {
2054
+ "epoch": 1.4878706199460916,
2055
+ "grad_norm": 0.5499842762947083,
2056
+ "learning_rate": 0.0005113049109552077,
2057
+ "loss": 3.7275,
2058
+ "step": 13800
2059
+ },
2060
+ {
2061
+ "epoch": 1.4932614555256065,
2062
+ "grad_norm": 0.6198793053627014,
2063
+ "learning_rate": 0.0005109811117107393,
2064
+ "loss": 3.7118,
2065
+ "step": 13850
2066
+ },
2067
+ {
2068
+ "epoch": 1.4986522911051212,
2069
+ "grad_norm": 0.5693920850753784,
2070
+ "learning_rate": 0.0005106573124662708,
2071
+ "loss": 3.7209,
2072
+ "step": 13900
2073
+ },
2074
+ {
2075
+ "epoch": 1.5040431266846361,
2076
+ "grad_norm": 0.6063724160194397,
2077
+ "learning_rate": 0.0005103335132218025,
2078
+ "loss": 3.7108,
2079
+ "step": 13950
2080
+ },
2081
+ {
2082
+ "epoch": 1.509433962264151,
2083
+ "grad_norm": 0.5341413021087646,
2084
+ "learning_rate": 0.000510009713977334,
2085
+ "loss": 3.7092,
2086
+ "step": 14000
2087
+ },
2088
+ {
2089
+ "epoch": 1.509433962264151,
2090
+ "eval_accuracy": 0.35327356482845174,
2091
+ "eval_loss": 3.66573429107666,
2092
+ "eval_runtime": 144.4021,
2093
+ "eval_samples_per_second": 124.728,
2094
+ "eval_steps_per_second": 7.798,
2095
+ "step": 14000
2096
+ },
2097
+ {
2098
+ "epoch": 1.5148247978436657,
2099
+ "grad_norm": 0.5940130352973938,
2100
+ "learning_rate": 0.0005096859147328656,
2101
+ "loss": 3.7083,
2102
+ "step": 14050
2103
+ },
2104
+ {
2105
+ "epoch": 1.5202156334231804,
2106
+ "grad_norm": 0.5854769945144653,
2107
+ "learning_rate": 0.0005093621154883971,
2108
+ "loss": 3.7072,
2109
+ "step": 14100
2110
+ },
2111
+ {
2112
+ "epoch": 1.5256064690026954,
2113
+ "grad_norm": 0.5369813442230225,
2114
+ "learning_rate": 0.0005090383162439288,
2115
+ "loss": 3.6994,
2116
+ "step": 14150
2117
+ },
2118
+ {
2119
+ "epoch": 1.5309973045822103,
2120
+ "grad_norm": 0.6851678490638733,
2121
+ "learning_rate": 0.0005087145169994602,
2122
+ "loss": 3.7003,
2123
+ "step": 14200
2124
+ },
2125
+ {
2126
+ "epoch": 1.536388140161725,
2127
+ "grad_norm": 0.5451446771621704,
2128
+ "learning_rate": 0.0005083907177549918,
2129
+ "loss": 3.7145,
2130
+ "step": 14250
2131
+ },
2132
+ {
2133
+ "epoch": 1.54177897574124,
2134
+ "grad_norm": 0.550528347492218,
2135
+ "learning_rate": 0.0005080669185105234,
2136
+ "loss": 3.7145,
2137
+ "step": 14300
2138
+ },
2139
+ {
2140
+ "epoch": 1.5471698113207548,
2141
+ "grad_norm": 0.5607097744941711,
2142
+ "learning_rate": 0.000507743119266055,
2143
+ "loss": 3.7074,
2144
+ "step": 14350
2145
+ },
2146
+ {
2147
+ "epoch": 1.5525606469002695,
2148
+ "grad_norm": 0.5699529051780701,
2149
+ "learning_rate": 0.0005074193200215865,
2150
+ "loss": 3.7239,
2151
+ "step": 14400
2152
+ },
2153
+ {
2154
+ "epoch": 1.5579514824797842,
2155
+ "grad_norm": 0.6193976402282715,
2156
+ "learning_rate": 0.0005070955207771181,
2157
+ "loss": 3.7146,
2158
+ "step": 14450
2159
+ },
2160
+ {
2161
+ "epoch": 1.5633423180592994,
2162
+ "grad_norm": 0.5902137756347656,
2163
+ "learning_rate": 0.0005067717215326498,
2164
+ "loss": 3.6944,
2165
+ "step": 14500
2166
+ },
2167
+ {
2168
+ "epoch": 1.568733153638814,
2169
+ "grad_norm": 0.5481423735618591,
2170
+ "learning_rate": 0.0005064479222881813,
2171
+ "loss": 3.7103,
2172
+ "step": 14550
2173
+ },
2174
+ {
2175
+ "epoch": 1.5741239892183287,
2176
+ "grad_norm": 0.6323958039283752,
2177
+ "learning_rate": 0.0005061241230437129,
2178
+ "loss": 3.6955,
2179
+ "step": 14600
2180
+ },
2181
+ {
2182
+ "epoch": 1.5795148247978437,
2183
+ "grad_norm": 0.6062991619110107,
2184
+ "learning_rate": 0.0005058003237992444,
2185
+ "loss": 3.6937,
2186
+ "step": 14650
2187
+ },
2188
+ {
2189
+ "epoch": 1.5849056603773586,
2190
+ "grad_norm": 0.5340829491615295,
2191
+ "learning_rate": 0.0005054830005396654,
2192
+ "loss": 3.7098,
2193
+ "step": 14700
2194
+ },
2195
+ {
2196
+ "epoch": 1.5902964959568733,
2197
+ "grad_norm": 0.590190052986145,
2198
+ "learning_rate": 0.0005051592012951969,
2199
+ "loss": 3.7098,
2200
+ "step": 14750
2201
+ },
2202
+ {
2203
+ "epoch": 1.595687331536388,
2204
+ "grad_norm": 0.5628999471664429,
2205
+ "learning_rate": 0.0005048354020507286,
2206
+ "loss": 3.6799,
2207
+ "step": 14800
2208
+ },
2209
+ {
2210
+ "epoch": 1.6010781671159031,
2211
+ "grad_norm": 0.5697661638259888,
2212
+ "learning_rate": 0.00050451160280626,
2213
+ "loss": 3.7014,
2214
+ "step": 14850
2215
+ },
2216
+ {
2217
+ "epoch": 1.6064690026954178,
2218
+ "grad_norm": 0.5965352058410645,
2219
+ "learning_rate": 0.0005041878035617917,
2220
+ "loss": 3.7081,
2221
+ "step": 14900
2222
+ },
2223
+ {
2224
+ "epoch": 1.6118598382749325,
2225
+ "grad_norm": 0.5691752433776855,
2226
+ "learning_rate": 0.0005038640043173232,
2227
+ "loss": 3.697,
2228
+ "step": 14950
2229
+ },
2230
+ {
2231
+ "epoch": 1.6172506738544474,
2232
+ "grad_norm": 0.5844449400901794,
2233
+ "learning_rate": 0.0005035402050728548,
2234
+ "loss": 3.6968,
2235
+ "step": 15000
2236
+ },
2237
+ {
2238
+ "epoch": 1.6172506738544474,
2239
+ "eval_accuracy": 0.35501950808274263,
2240
+ "eval_loss": 3.6473617553710938,
2241
+ "eval_runtime": 144.9331,
2242
+ "eval_samples_per_second": 124.271,
2243
+ "eval_steps_per_second": 7.769,
2244
+ "step": 15000
2245
+ },
2246
+ {
2247
+ "epoch": 1.6226415094339623,
2248
+ "grad_norm": 0.5243232846260071,
2249
+ "learning_rate": 0.0005032164058283863,
2250
+ "loss": 3.6876,
2251
+ "step": 15050
2252
+ },
2253
+ {
2254
+ "epoch": 1.628032345013477,
2255
+ "grad_norm": 0.521431565284729,
2256
+ "learning_rate": 0.0005028926065839179,
2257
+ "loss": 3.6816,
2258
+ "step": 15100
2259
+ },
2260
+ {
2261
+ "epoch": 1.633423180592992,
2262
+ "grad_norm": 0.54181307554245,
2263
+ "learning_rate": 0.0005025688073394495,
2264
+ "loss": 3.681,
2265
+ "step": 15150
2266
+ },
2267
+ {
2268
+ "epoch": 1.6388140161725069,
2269
+ "grad_norm": 0.5746421217918396,
2270
+ "learning_rate": 0.0005022450080949811,
2271
+ "loss": 3.6847,
2272
+ "step": 15200
2273
+ },
2274
+ {
2275
+ "epoch": 1.6442048517520216,
2276
+ "grad_norm": 0.5637415051460266,
2277
+ "learning_rate": 0.0005019212088505126,
2278
+ "loss": 3.6782,
2279
+ "step": 15250
2280
+ },
2281
+ {
2282
+ "epoch": 1.6495956873315363,
2283
+ "grad_norm": 0.5300856828689575,
2284
+ "learning_rate": 0.0005015974096060442,
2285
+ "loss": 3.6848,
2286
+ "step": 15300
2287
+ },
2288
+ {
2289
+ "epoch": 1.6549865229110512,
2290
+ "grad_norm": 0.5390794277191162,
2291
+ "learning_rate": 0.0005012736103615758,
2292
+ "loss": 3.7087,
2293
+ "step": 15350
2294
+ },
2295
+ {
2296
+ "epoch": 1.6603773584905661,
2297
+ "grad_norm": 0.6010688543319702,
2298
+ "learning_rate": 0.0005009498111171074,
2299
+ "loss": 3.6966,
2300
+ "step": 15400
2301
+ },
2302
+ {
2303
+ "epoch": 1.6657681940700808,
2304
+ "grad_norm": 0.548942506313324,
2305
+ "learning_rate": 0.0005006260118726389,
2306
+ "loss": 3.6648,
2307
+ "step": 15450
2308
+ },
2309
+ {
2310
+ "epoch": 1.6711590296495957,
2311
+ "grad_norm": 0.5794510245323181,
2312
+ "learning_rate": 0.0005003022126281705,
2313
+ "loss": 3.6827,
2314
+ "step": 15500
2315
+ },
2316
+ {
2317
+ "epoch": 1.6765498652291106,
2318
+ "grad_norm": 0.5393055081367493,
2319
+ "learning_rate": 0.000499978413383702,
2320
+ "loss": 3.68,
2321
+ "step": 15550
2322
+ },
2323
+ {
2324
+ "epoch": 1.6819407008086253,
2325
+ "grad_norm": 0.5647615194320679,
2326
+ "learning_rate": 0.0004996546141392336,
2327
+ "loss": 3.6677,
2328
+ "step": 15600
2329
+ },
2330
+ {
2331
+ "epoch": 1.68733153638814,
2332
+ "grad_norm": 0.5918028354644775,
2333
+ "learning_rate": 0.0004993308148947651,
2334
+ "loss": 3.6859,
2335
+ "step": 15650
2336
+ },
2337
+ {
2338
+ "epoch": 1.692722371967655,
2339
+ "grad_norm": 0.5556984543800354,
2340
+ "learning_rate": 0.0004990070156502968,
2341
+ "loss": 3.6733,
2342
+ "step": 15700
2343
+ },
2344
+ {
2345
+ "epoch": 1.6981132075471699,
2346
+ "grad_norm": 0.5747886896133423,
2347
+ "learning_rate": 0.0004986832164058284,
2348
+ "loss": 3.6979,
2349
+ "step": 15750
2350
+ },
2351
+ {
2352
+ "epoch": 1.7035040431266846,
2353
+ "grad_norm": 0.5632966756820679,
2354
+ "learning_rate": 0.0004983594171613599,
2355
+ "loss": 3.6957,
2356
+ "step": 15800
2357
+ },
2358
+ {
2359
+ "epoch": 1.7088948787061995,
2360
+ "grad_norm": 0.5908682942390442,
2361
+ "learning_rate": 0.0004980356179168915,
2362
+ "loss": 3.6886,
2363
+ "step": 15850
2364
+ },
2365
+ {
2366
+ "epoch": 1.7142857142857144,
2367
+ "grad_norm": 0.6491772532463074,
2368
+ "learning_rate": 0.000497711818672423,
2369
+ "loss": 3.6902,
2370
+ "step": 15900
2371
+ },
2372
+ {
2373
+ "epoch": 1.719676549865229,
2374
+ "grad_norm": 0.594153106212616,
2375
+ "learning_rate": 0.0004973880194279547,
2376
+ "loss": 3.6752,
2377
+ "step": 15950
2378
+ },
2379
+ {
2380
+ "epoch": 1.7250673854447438,
2381
+ "grad_norm": 0.6420136094093323,
2382
+ "learning_rate": 0.0004970642201834862,
2383
+ "loss": 3.6724,
2384
+ "step": 16000
2385
+ },
2386
+ {
2387
+ "epoch": 1.7250673854447438,
2388
+ "eval_accuracy": 0.3570638121669721,
2389
+ "eval_loss": 3.6263442039489746,
2390
+ "eval_runtime": 144.5518,
2391
+ "eval_samples_per_second": 124.599,
2392
+ "eval_steps_per_second": 7.79,
2393
+ "step": 16000
2394
+ },
2395
+ {
2396
+ "epoch": 1.7304582210242587,
2397
+ "grad_norm": 0.5980614423751831,
2398
+ "learning_rate": 0.0004967404209390178,
2399
+ "loss": 3.6727,
2400
+ "step": 16050
2401
+ },
2402
+ {
2403
+ "epoch": 1.7358490566037736,
2404
+ "grad_norm": 0.5954625010490417,
2405
+ "learning_rate": 0.0004964166216945493,
2406
+ "loss": 3.6587,
2407
+ "step": 16100
2408
+ },
2409
+ {
2410
+ "epoch": 1.7412398921832883,
2411
+ "grad_norm": 0.563701868057251,
2412
+ "learning_rate": 0.000496092822450081,
2413
+ "loss": 3.683,
2414
+ "step": 16150
2415
+ },
2416
+ {
2417
+ "epoch": 1.7466307277628033,
2418
+ "grad_norm": 0.5353266596794128,
2419
+ "learning_rate": 0.0004957690232056125,
2420
+ "loss": 3.6928,
2421
+ "step": 16200
2422
+ },
2423
+ {
2424
+ "epoch": 1.7520215633423182,
2425
+ "grad_norm": 0.6115831136703491,
2426
+ "learning_rate": 0.0004954452239611441,
2427
+ "loss": 3.6697,
2428
+ "step": 16250
2429
+ },
2430
+ {
2431
+ "epoch": 1.7574123989218329,
2432
+ "grad_norm": 0.5373592376708984,
2433
+ "learning_rate": 0.0004951214247166756,
2434
+ "loss": 3.6907,
2435
+ "step": 16300
2436
+ },
2437
+ {
2438
+ "epoch": 1.7628032345013476,
2439
+ "grad_norm": 0.7120974063873291,
2440
+ "learning_rate": 0.0004947976254722072,
2441
+ "loss": 3.6741,
2442
+ "step": 16350
2443
+ },
2444
+ {
2445
+ "epoch": 1.7681940700808625,
2446
+ "grad_norm": 0.5365995764732361,
2447
+ "learning_rate": 0.0004944738262277387,
2448
+ "loss": 3.6656,
2449
+ "step": 16400
2450
+ },
2451
+ {
2452
+ "epoch": 1.7735849056603774,
2453
+ "grad_norm": 0.6026771664619446,
2454
+ "learning_rate": 0.0004941500269832703,
2455
+ "loss": 3.6708,
2456
+ "step": 16450
2457
+ },
2458
+ {
2459
+ "epoch": 1.778975741239892,
2460
+ "grad_norm": 0.5560598969459534,
2461
+ "learning_rate": 0.0004938262277388019,
2462
+ "loss": 3.6594,
2463
+ "step": 16500
2464
+ },
2465
+ {
2466
+ "epoch": 1.784366576819407,
2467
+ "grad_norm": 0.6191685199737549,
2468
+ "learning_rate": 0.0004935024284943335,
2469
+ "loss": 3.685,
2470
+ "step": 16550
2471
+ },
2472
+ {
2473
+ "epoch": 1.789757412398922,
2474
+ "grad_norm": 0.6867356896400452,
2475
+ "learning_rate": 0.000493178629249865,
2476
+ "loss": 3.6698,
2477
+ "step": 16600
2478
+ },
2479
+ {
2480
+ "epoch": 1.7951482479784366,
2481
+ "grad_norm": 0.5517817139625549,
2482
+ "learning_rate": 0.0004928548300053966,
2483
+ "loss": 3.6725,
2484
+ "step": 16650
2485
+ },
2486
+ {
2487
+ "epoch": 1.8005390835579513,
2488
+ "grad_norm": 0.5496578216552734,
2489
+ "learning_rate": 0.0004925310307609282,
2490
+ "loss": 3.6745,
2491
+ "step": 16700
2492
+ },
2493
+ {
2494
+ "epoch": 1.8059299191374663,
2495
+ "grad_norm": 0.540912926197052,
2496
+ "learning_rate": 0.0004922072315164598,
2497
+ "loss": 3.6624,
2498
+ "step": 16750
2499
+ },
2500
+ {
2501
+ "epoch": 1.8113207547169812,
2502
+ "grad_norm": 0.5637075304985046,
2503
+ "learning_rate": 0.0004918834322719913,
2504
+ "loss": 3.6658,
2505
+ "step": 16800
2506
+ },
2507
+ {
2508
+ "epoch": 1.8167115902964959,
2509
+ "grad_norm": 0.5674017071723938,
2510
+ "learning_rate": 0.0004915596330275229,
2511
+ "loss": 3.6701,
2512
+ "step": 16850
2513
+ },
2514
+ {
2515
+ "epoch": 1.8221024258760108,
2516
+ "grad_norm": 0.5619826912879944,
2517
+ "learning_rate": 0.0004912487857528331,
2518
+ "loss": 3.6714,
2519
+ "step": 16900
2520
+ },
2521
+ {
2522
+ "epoch": 1.8274932614555257,
2523
+ "grad_norm": 0.5217414498329163,
2524
+ "learning_rate": 0.0004909249865083648,
2525
+ "loss": 3.6863,
2526
+ "step": 16950
2527
+ },
2528
+ {
2529
+ "epoch": 1.8328840970350404,
2530
+ "grad_norm": 0.565362811088562,
2531
+ "learning_rate": 0.0004906011872638964,
2532
+ "loss": 3.6737,
2533
+ "step": 17000
2534
+ },
2535
+ {
2536
+ "epoch": 1.8328840970350404,
2537
+ "eval_accuracy": 0.3582435652228617,
2538
+ "eval_loss": 3.6162045001983643,
2539
+ "eval_runtime": 144.7573,
2540
+ "eval_samples_per_second": 124.422,
2541
+ "eval_steps_per_second": 7.779,
2542
+ "step": 17000
2543
+ },
2544
+ {
2545
+ "epoch": 1.838274932614555,
2546
+ "grad_norm": 0.5198625326156616,
2547
+ "learning_rate": 0.0004902773880194279,
2548
+ "loss": 3.6515,
2549
+ "step": 17050
2550
+ },
2551
+ {
2552
+ "epoch": 1.8436657681940702,
2553
+ "grad_norm": 0.5708647966384888,
2554
+ "learning_rate": 0.0004899535887749595,
2555
+ "loss": 3.6661,
2556
+ "step": 17100
2557
+ },
2558
+ {
2559
+ "epoch": 1.849056603773585,
2560
+ "grad_norm": 0.583521842956543,
2561
+ "learning_rate": 0.000489629789530491,
2562
+ "loss": 3.6622,
2563
+ "step": 17150
2564
+ },
2565
+ {
2566
+ "epoch": 1.8544474393530996,
2567
+ "grad_norm": 0.5446178913116455,
2568
+ "learning_rate": 0.0004893059902860227,
2569
+ "loss": 3.647,
2570
+ "step": 17200
2571
+ },
2572
+ {
2573
+ "epoch": 1.8598382749326146,
2574
+ "grad_norm": 0.5899081230163574,
2575
+ "learning_rate": 0.0004889821910415542,
2576
+ "loss": 3.6697,
2577
+ "step": 17250
2578
+ },
2579
+ {
2580
+ "epoch": 1.8652291105121295,
2581
+ "grad_norm": 0.6142265796661377,
2582
+ "learning_rate": 0.0004886583917970858,
2583
+ "loss": 3.6597,
2584
+ "step": 17300
2585
+ },
2586
+ {
2587
+ "epoch": 1.8706199460916442,
2588
+ "grad_norm": 0.9669179320335388,
2589
+ "learning_rate": 0.0004883345925526173,
2590
+ "loss": 3.6618,
2591
+ "step": 17350
2592
+ },
2593
+ {
2594
+ "epoch": 1.8760107816711589,
2595
+ "grad_norm": 0.564871072769165,
2596
+ "learning_rate": 0.00048801079330814887,
2597
+ "loss": 3.6571,
2598
+ "step": 17400
2599
+ },
2600
+ {
2601
+ "epoch": 1.881401617250674,
2602
+ "grad_norm": 0.5841614007949829,
2603
+ "learning_rate": 0.00048768699406368047,
2604
+ "loss": 3.6743,
2605
+ "step": 17450
2606
+ },
2607
+ {
2608
+ "epoch": 1.8867924528301887,
2609
+ "grad_norm": 0.5918266773223877,
2610
+ "learning_rate": 0.000487363194819212,
2611
+ "loss": 3.6675,
2612
+ "step": 17500
2613
+ },
2614
+ {
2615
+ "epoch": 1.8921832884097034,
2616
+ "grad_norm": 0.5641137361526489,
2617
+ "learning_rate": 0.0004870393955747436,
2618
+ "loss": 3.6488,
2619
+ "step": 17550
2620
+ },
2621
+ {
2622
+ "epoch": 1.8975741239892183,
2623
+ "grad_norm": 0.5747765898704529,
2624
+ "learning_rate": 0.0004867155963302752,
2625
+ "loss": 3.6558,
2626
+ "step": 17600
2627
+ },
2628
+ {
2629
+ "epoch": 1.9029649595687332,
2630
+ "grad_norm": 0.5801583528518677,
2631
+ "learning_rate": 0.0004863917970858068,
2632
+ "loss": 3.6348,
2633
+ "step": 17650
2634
+ },
2635
+ {
2636
+ "epoch": 1.908355795148248,
2637
+ "grad_norm": 0.5788767337799072,
2638
+ "learning_rate": 0.00048606799784133833,
2639
+ "loss": 3.6446,
2640
+ "step": 17700
2641
+ },
2642
+ {
2643
+ "epoch": 1.9137466307277629,
2644
+ "grad_norm": 0.5854235291481018,
2645
+ "learning_rate": 0.00048574419859686994,
2646
+ "loss": 3.648,
2647
+ "step": 17750
2648
+ },
2649
+ {
2650
+ "epoch": 1.9191374663072778,
2651
+ "grad_norm": 0.5577847957611084,
2652
+ "learning_rate": 0.0004854203993524015,
2653
+ "loss": 3.6665,
2654
+ "step": 17800
2655
+ },
2656
+ {
2657
+ "epoch": 1.9245283018867925,
2658
+ "grad_norm": 0.536088228225708,
2659
+ "learning_rate": 0.000485096600107933,
2660
+ "loss": 3.6671,
2661
+ "step": 17850
2662
+ },
2663
+ {
2664
+ "epoch": 1.9299191374663072,
2665
+ "grad_norm": 0.5481696724891663,
2666
+ "learning_rate": 0.00048477280086346464,
2667
+ "loss": 3.6721,
2668
+ "step": 17900
2669
+ },
2670
+ {
2671
+ "epoch": 1.935309973045822,
2672
+ "grad_norm": 0.547725260257721,
2673
+ "learning_rate": 0.00048444900161899614,
2674
+ "loss": 3.6622,
2675
+ "step": 17950
2676
+ },
2677
+ {
2678
+ "epoch": 1.940700808625337,
2679
+ "grad_norm": 0.5937644243240356,
2680
+ "learning_rate": 0.00048412520237452774,
2681
+ "loss": 3.6477,
2682
+ "step": 18000
2683
+ },
2684
+ {
2685
+ "epoch": 1.940700808625337,
2686
+ "eval_accuracy": 0.3597537317105952,
2687
+ "eval_loss": 3.5971779823303223,
2688
+ "eval_runtime": 144.7293,
2689
+ "eval_samples_per_second": 124.446,
2690
+ "eval_steps_per_second": 7.78,
2691
+ "step": 18000
2692
+ },
2693
+ {
2694
+ "epoch": 1.9460916442048517,
2695
+ "grad_norm": 0.5931717753410339,
2696
+ "learning_rate": 0.0004838014031300593,
2697
+ "loss": 3.6358,
2698
+ "step": 18050
2699
+ },
2700
+ {
2701
+ "epoch": 1.9514824797843666,
2702
+ "grad_norm": 0.6099575757980347,
2703
+ "learning_rate": 0.0004834776038855909,
2704
+ "loss": 3.644,
2705
+ "step": 18100
2706
+ },
2707
+ {
2708
+ "epoch": 1.9568733153638815,
2709
+ "grad_norm": 0.5704085230827332,
2710
+ "learning_rate": 0.00048315380464112245,
2711
+ "loss": 3.6517,
2712
+ "step": 18150
2713
+ },
2714
+ {
2715
+ "epoch": 1.9622641509433962,
2716
+ "grad_norm": 0.5262129902839661,
2717
+ "learning_rate": 0.00048283000539665405,
2718
+ "loss": 3.6523,
2719
+ "step": 18200
2720
+ },
2721
+ {
2722
+ "epoch": 1.967654986522911,
2723
+ "grad_norm": 0.512693464756012,
2724
+ "learning_rate": 0.0004825062061521856,
2725
+ "loss": 3.6432,
2726
+ "step": 18250
2727
+ },
2728
+ {
2729
+ "epoch": 1.9730458221024259,
2730
+ "grad_norm": 0.5602070093154907,
2731
+ "learning_rate": 0.00048218240690771716,
2732
+ "loss": 3.6688,
2733
+ "step": 18300
2734
+ },
2735
+ {
2736
+ "epoch": 1.9784366576819408,
2737
+ "grad_norm": 0.5258756875991821,
2738
+ "learning_rate": 0.00048185860766324876,
2739
+ "loss": 3.6471,
2740
+ "step": 18350
2741
+ },
2742
+ {
2743
+ "epoch": 1.9838274932614555,
2744
+ "grad_norm": 0.5815879106521606,
2745
+ "learning_rate": 0.0004815348084187803,
2746
+ "loss": 3.6496,
2747
+ "step": 18400
2748
+ },
2749
+ {
2750
+ "epoch": 1.9892183288409704,
2751
+ "grad_norm": 0.5756340622901917,
2752
+ "learning_rate": 0.0004812110091743119,
2753
+ "loss": 3.6589,
2754
+ "step": 18450
2755
+ },
2756
+ {
2757
+ "epoch": 1.9946091644204853,
2758
+ "grad_norm": 0.6157427430152893,
2759
+ "learning_rate": 0.00048088720992984347,
2760
+ "loss": 3.6378,
2761
+ "step": 18500
2762
+ },
2763
+ {
2764
+ "epoch": 2.0,
2765
+ "grad_norm": 1.1887223720550537,
2766
+ "learning_rate": 0.00048056341068537507,
2767
+ "loss": 3.6418,
2768
+ "step": 18550
2769
+ },
2770
+ {
2771
+ "epoch": 2.0053908355795147,
2772
+ "grad_norm": 0.5796721577644348,
2773
+ "learning_rate": 0.00048023961144090657,
2774
+ "loss": 3.5398,
2775
+ "step": 18600
2776
+ },
2777
+ {
2778
+ "epoch": 2.01078167115903,
2779
+ "grad_norm": 0.5702629089355469,
2780
+ "learning_rate": 0.00047991581219643817,
2781
+ "loss": 3.5522,
2782
+ "step": 18650
2783
+ },
2784
+ {
2785
+ "epoch": 2.0161725067385445,
2786
+ "grad_norm": 0.5687718391418457,
2787
+ "learning_rate": 0.0004795920129519697,
2788
+ "loss": 3.5469,
2789
+ "step": 18700
2790
+ },
2791
+ {
2792
+ "epoch": 2.0215633423180592,
2793
+ "grad_norm": 0.6058363914489746,
2794
+ "learning_rate": 0.0004792682137075013,
2795
+ "loss": 3.5565,
2796
+ "step": 18750
2797
+ },
2798
+ {
2799
+ "epoch": 2.026954177897574,
2800
+ "grad_norm": 0.5502753257751465,
2801
+ "learning_rate": 0.0004789444144630329,
2802
+ "loss": 3.5781,
2803
+ "step": 18800
2804
+ },
2805
+ {
2806
+ "epoch": 2.032345013477089,
2807
+ "grad_norm": 0.5815129280090332,
2808
+ "learning_rate": 0.00047862061521856443,
2809
+ "loss": 3.5754,
2810
+ "step": 18850
2811
+ },
2812
+ {
2813
+ "epoch": 2.0377358490566038,
2814
+ "grad_norm": 0.5740176439285278,
2815
+ "learning_rate": 0.00047829681597409603,
2816
+ "loss": 3.5584,
2817
+ "step": 18900
2818
+ },
2819
+ {
2820
+ "epoch": 2.0431266846361185,
2821
+ "grad_norm": 0.5666323304176331,
2822
+ "learning_rate": 0.0004779730167296276,
2823
+ "loss": 3.5638,
2824
+ "step": 18950
2825
+ },
2826
+ {
2827
+ "epoch": 2.0485175202156336,
2828
+ "grad_norm": 0.5930228233337402,
2829
+ "learning_rate": 0.0004776492174851592,
2830
+ "loss": 3.5922,
2831
+ "step": 19000
2832
+ },
2833
+ {
2834
+ "epoch": 2.0485175202156336,
2835
+ "eval_accuracy": 0.36108874974317173,
2836
+ "eval_loss": 3.588949203491211,
2837
+ "eval_runtime": 144.8607,
2838
+ "eval_samples_per_second": 124.333,
2839
+ "eval_steps_per_second": 7.773,
2840
+ "step": 19000
2841
+ },
2842
+ {
2843
+ "epoch": 2.0539083557951483,
2844
+ "grad_norm": 0.5626940727233887,
2845
+ "learning_rate": 0.00047732541824069074,
2846
+ "loss": 3.5695,
2847
+ "step": 19050
2848
+ },
2849
+ {
2850
+ "epoch": 2.059299191374663,
2851
+ "grad_norm": 0.6210435628890991,
2852
+ "learning_rate": 0.0004770016189962223,
2853
+ "loss": 3.5607,
2854
+ "step": 19100
2855
+ },
2856
+ {
2857
+ "epoch": 2.0646900269541777,
2858
+ "grad_norm": 0.5758371949195862,
2859
+ "learning_rate": 0.0004766778197517539,
2860
+ "loss": 3.5751,
2861
+ "step": 19150
2862
+ },
2863
+ {
2864
+ "epoch": 2.070080862533693,
2865
+ "grad_norm": 0.6037834286689758,
2866
+ "learning_rate": 0.0004763540205072854,
2867
+ "loss": 3.5875,
2868
+ "step": 19200
2869
+ },
2870
+ {
2871
+ "epoch": 2.0754716981132075,
2872
+ "grad_norm": 0.5686853528022766,
2873
+ "learning_rate": 0.00047603022126281705,
2874
+ "loss": 3.5713,
2875
+ "step": 19250
2876
+ },
2877
+ {
2878
+ "epoch": 2.0808625336927222,
2879
+ "grad_norm": 0.6485291123390198,
2880
+ "learning_rate": 0.00047570642201834855,
2881
+ "loss": 3.5609,
2882
+ "step": 19300
2883
+ },
2884
+ {
2885
+ "epoch": 2.0862533692722374,
2886
+ "grad_norm": 0.6449443697929382,
2887
+ "learning_rate": 0.00047538262277388015,
2888
+ "loss": 3.5789,
2889
+ "step": 19350
2890
+ },
2891
+ {
2892
+ "epoch": 2.091644204851752,
2893
+ "grad_norm": 0.550469696521759,
2894
+ "learning_rate": 0.0004750588235294117,
2895
+ "loss": 3.554,
2896
+ "step": 19400
2897
+ },
2898
+ {
2899
+ "epoch": 2.0970350404312668,
2900
+ "grad_norm": 0.5864623785018921,
2901
+ "learning_rate": 0.0004747350242849433,
2902
+ "loss": 3.5732,
2903
+ "step": 19450
2904
+ },
2905
+ {
2906
+ "epoch": 2.1024258760107815,
2907
+ "grad_norm": 0.6241238117218018,
2908
+ "learning_rate": 0.00047441122504047486,
2909
+ "loss": 3.5717,
2910
+ "step": 19500
2911
+ },
2912
+ {
2913
+ "epoch": 2.1078167115902966,
2914
+ "grad_norm": 0.602936327457428,
2915
+ "learning_rate": 0.0004740874257960064,
2916
+ "loss": 3.5604,
2917
+ "step": 19550
2918
+ },
2919
+ {
2920
+ "epoch": 2.1132075471698113,
2921
+ "grad_norm": 0.5674468874931335,
2922
+ "learning_rate": 0.000473763626551538,
2923
+ "loss": 3.5643,
2924
+ "step": 19600
2925
+ },
2926
+ {
2927
+ "epoch": 2.118598382749326,
2928
+ "grad_norm": 0.600834310054779,
2929
+ "learning_rate": 0.00047343982730706956,
2930
+ "loss": 3.5695,
2931
+ "step": 19650
2932
+ },
2933
+ {
2934
+ "epoch": 2.123989218328841,
2935
+ "grad_norm": 0.5601726770401001,
2936
+ "learning_rate": 0.00047311602806260117,
2937
+ "loss": 3.5724,
2938
+ "step": 19700
2939
+ },
2940
+ {
2941
+ "epoch": 2.129380053908356,
2942
+ "grad_norm": 0.5551213622093201,
2943
+ "learning_rate": 0.0004727922288181327,
2944
+ "loss": 3.5783,
2945
+ "step": 19750
2946
+ },
2947
+ {
2948
+ "epoch": 2.1347708894878705,
2949
+ "grad_norm": 0.5737272500991821,
2950
+ "learning_rate": 0.0004724684295736643,
2951
+ "loss": 3.5642,
2952
+ "step": 19800
2953
+ },
2954
+ {
2955
+ "epoch": 2.1401617250673857,
2956
+ "grad_norm": 0.5712146162986755,
2957
+ "learning_rate": 0.0004721446303291959,
2958
+ "loss": 3.5894,
2959
+ "step": 19850
2960
+ },
2961
+ {
2962
+ "epoch": 2.1455525606469004,
2963
+ "grad_norm": 0.5902134776115417,
2964
+ "learning_rate": 0.0004718208310847275,
2965
+ "loss": 3.5903,
2966
+ "step": 19900
2967
+ },
2968
+ {
2969
+ "epoch": 2.150943396226415,
2970
+ "grad_norm": 0.6139191389083862,
2971
+ "learning_rate": 0.000471497031840259,
2972
+ "loss": 3.5862,
2973
+ "step": 19950
2974
+ },
2975
+ {
2976
+ "epoch": 2.1563342318059298,
2977
+ "grad_norm": 0.5515206456184387,
2978
+ "learning_rate": 0.0004711732325957905,
2979
+ "loss": 3.5706,
2980
+ "step": 20000
2981
+ },
2982
+ {
2983
+ "epoch": 2.1563342318059298,
2984
+ "eval_accuracy": 0.3621358376230399,
2985
+ "eval_loss": 3.5808968544006348,
2986
+ "eval_runtime": 144.61,
2987
+ "eval_samples_per_second": 124.549,
2988
+ "eval_steps_per_second": 7.786,
2989
+ "step": 20000
2990
+ }
2991
+ ],
2992
+ "logging_steps": 50,
2993
+ "max_steps": 92750,
2994
+ "num_input_tokens_seen": 0,
2995
+ "num_train_epochs": 10,
2996
+ "save_steps": 10000,
2997
+ "stateful_callbacks": {
2998
+ "TrainerControl": {
2999
+ "args": {
3000
+ "should_epoch_stop": false,
3001
+ "should_evaluate": false,
3002
+ "should_log": false,
3003
+ "should_save": true,
3004
+ "should_training_stop": false
3005
+ },
3006
+ "attributes": {}
3007
+ }
3008
+ },
3009
+ "total_flos": 1.672138358784e+17,
3010
+ "train_batch_size": 32,
3011
+ "trial_name": null,
3012
+ "trial_params": null
3013
+ }
checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55f163471d892c378f64db98f5c5595dd2d777d1501861716698287d58ed0c89
3
+ size 5304