AlexWortega commited on
Commit
a767f08
·
verified ·
1 Parent(s): 8200fbb

Upload checkpoint-10000

Browse files
.gitattributes CHANGED
@@ -51,3 +51,21 @@ checkpoint-30000/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs
51
  checkpoint-30000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
52
  checkpoint-30000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
53
  checkpoint-30000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  checkpoint-30000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
52
  checkpoint-30000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
53
  checkpoint-30000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
54
+ optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
55
+ optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
56
+ optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
57
+ optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
58
+ optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
59
+ optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
60
+ optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
61
+ optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
62
+ optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
63
+ pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text
64
+ pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
65
+ pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
66
+ pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
67
+ pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
68
+ pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
69
+ pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
70
+ pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
71
+ pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b6e4fef1449e764e63617312ff36b134b79f477317beb171b0740c69c31436f
3
+ size 1577571
optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0226fde4b561d4015ac6313a277611418c823493c39849a88ba84633d06268e
3
+ size 4042716233
optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7117efae903d19712e435aa19fffaa0cbead5338e6d9f4fa5b4149b9ed4d76
3
+ size 4042809589
optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fcf70a3afc0eecc5a3c422cdf9a9253524c9150623838f66f330add51900465
3
+ size 4042809589
optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2ba060b65d30766479cd46064cb947192acc911f14dc1558b25b131d7714cde
3
+ size 4042809589
optimizer_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a0b3b750691b27820671ddc2ab53384d685d626d091863097fc52cab6505018
3
+ size 4042809589
optimizer_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86c726ccb1705178ea6c622e4e6dd6586559904e81f98ae1a0f384adf5e2da1a
3
+ size 4042817013
optimizer_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d6ca7e9093e07dae5173cb196b143fe8300d0f9c2b19b472ecd16287d3b152
3
+ size 4042820552
optimizer_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9305f9819bde82082631043f69a533f745ce1d52538a7be0a70a05c0f9b5425
3
+ size 4042787848
pytorch_model_fsdp_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d89018facf69483ec8a883be1c49b14d7509b34fab093f559511e3918103735a
3
+ size 1378806
pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed809cb64b44279cb3633fb989bab302965db59f28c9b6070e05e81f31ea371c
3
+ size 2534754104
pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48162aa26ea9cdd599615cda3a11128217371ce9b6ad6101f478114e6ccb6bff
3
+ size 2534754104
pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d8c64fc91f736a3b33dcc19aff66190847210255eae743938de4049df2fc126
3
+ size 2534754104
pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dc059c8e3b1e01ce76afc00f4db3b41b3cdc849838de5b35746fca8bddcb3f3
3
+ size 2534754104
pytorch_model_fsdp_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc80c4bfc63622c506486115b4c8bb14a5e289813fa6ef90a3a44b60dd345cc4
3
+ size 2534754104
pytorch_model_fsdp_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb6aa8e98b556a36ce9899d65e1698e91a04474222ea454c4f8066f4584bfb2d
3
+ size 2534754104
pytorch_model_fsdp_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c61fe1c477ab7bd3929b938aef60408f5e06d35ab6d62413ba69e11a1364f99
3
+ size 2534754104
pytorch_model_fsdp_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d667d11364675a52a6728444ebc600e09e94734cc6cb6204bde60b104b8c13a8
3
+ size 2534713144
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:826997d456e99c919bad937116b8aa018624d3cee7e87a68e844e70bd80aeb45
3
+ size 16389
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ed4ea9de6889ae5075a8093b9c23505ab8c5554ee18ff06fea428794e1db0c
3
+ size 16389
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a61500ed6f4ed1f30ad9f7e360264248052865b80b29cbdbe42cea3fcfdc3934
3
+ size 16389
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52136e6310f738523ce8b1da6f4ca37ccc5b3faa16f96d90873ade48d6d36b0a
3
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2e4694a733324ad8470e895f12f022f928d19a23a5a2eacad0a568da191db37
3
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ba38f72004469e5394586da18f61e173aa969dfa837593fc8e5b8247877b4f4
3
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6312012a936a01fbcbe06ee9ff87f68144ff51959bf530f08708c13ec6d49bc8
3
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b2169702236d42a74f3bb15eff84b84fce6da260a8c333c66459901c75a30c
3
+ size 16389
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a5a7b2dad441e5e2dccf21c3129e93bb2c435799b9325dae010bd3fda0c38e8
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,1434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.331707964308223,
6
+ "eval_steps": 15000000,
7
+ "global_step": 10000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0016585398215411152,
14
+ "grad_norm": 5.2646002769470215,
15
+ "learning_rate": 1.9503781345362876e-06,
16
+ "loss": 3.606,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.0033170796430822304,
21
+ "grad_norm": 11.763270378112793,
22
+ "learning_rate": 3.940559904471274e-06,
23
+ "loss": 2.6238,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.004975619464623346,
28
+ "grad_norm": 6.007935047149658,
29
+ "learning_rate": 5.930741674406262e-06,
30
+ "loss": 2.5644,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.006634159286164461,
35
+ "grad_norm": 13.862035751342773,
36
+ "learning_rate": 7.920923444341248e-06,
37
+ "loss": 2.4601,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.008292699107705576,
42
+ "grad_norm": 6.1305341720581055,
43
+ "learning_rate": 9.911105214276236e-06,
44
+ "loss": 2.4099,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.009951238929246691,
49
+ "grad_norm": 9.804327964782715,
50
+ "learning_rate": 1.1901286984211225e-05,
51
+ "loss": 2.3409,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.011609778750787806,
56
+ "grad_norm": 6.416324138641357,
57
+ "learning_rate": 1.3891468754146211e-05,
58
+ "loss": 2.2951,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.013268318572328922,
63
+ "grad_norm": 9.413233757019043,
64
+ "learning_rate": 1.5881650524081196e-05,
65
+ "loss": 2.0864,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.014926858393870037,
70
+ "grad_norm": 4.721926212310791,
71
+ "learning_rate": 1.7871832294016186e-05,
72
+ "loss": 0.7346,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.016585398215411152,
77
+ "grad_norm": 3.284379720687866,
78
+ "learning_rate": 1.9862014063951173e-05,
79
+ "loss": 0.4516,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.01824393803695227,
84
+ "grad_norm": 2.671109199523926,
85
+ "learning_rate": 2.185219583388616e-05,
86
+ "loss": 0.3903,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 0.019902477858493382,
91
+ "grad_norm": 2.644299268722534,
92
+ "learning_rate": 2.384237760382115e-05,
93
+ "loss": 0.3512,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 0.0215610176800345,
98
+ "grad_norm": 2.1769394874572754,
99
+ "learning_rate": 2.5832559373756132e-05,
100
+ "loss": 0.3298,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 0.023219557501575613,
105
+ "grad_norm": 2.0835165977478027,
106
+ "learning_rate": 2.7822741143691122e-05,
107
+ "loss": 0.3162,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 0.02487809732311673,
112
+ "grad_norm": 2.069329023361206,
113
+ "learning_rate": 2.981292291362611e-05,
114
+ "loss": 0.3185,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 0.026536637144657843,
119
+ "grad_norm": 1.8482165336608887,
120
+ "learning_rate": 3.1803104683561096e-05,
121
+ "loss": 0.3057,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 0.02819517696619896,
126
+ "grad_norm": 1.8083229064941406,
127
+ "learning_rate": 3.379328645349608e-05,
128
+ "loss": 0.2857,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 0.029853716787740073,
133
+ "grad_norm": 1.7274185419082642,
134
+ "learning_rate": 3.578346822343107e-05,
135
+ "loss": 0.3166,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 0.03151225660928119,
140
+ "grad_norm": 1.6723179817199707,
141
+ "learning_rate": 3.777364999336606e-05,
142
+ "loss": 0.2981,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 0.033170796430822304,
147
+ "grad_norm": 1.9198503494262695,
148
+ "learning_rate": 3.976383176330104e-05,
149
+ "loss": 0.2818,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 0.03482933625236342,
154
+ "grad_norm": 1.680993914604187,
155
+ "learning_rate": 4.1754013533236035e-05,
156
+ "loss": 0.2909,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 0.03648787607390454,
161
+ "grad_norm": 1.6936819553375244,
162
+ "learning_rate": 4.374419530317102e-05,
163
+ "loss": 0.2871,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 0.03814641589544565,
168
+ "grad_norm": 1.460769534111023,
169
+ "learning_rate": 4.5734377073106e-05,
170
+ "loss": 0.2734,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 0.039804955716986765,
175
+ "grad_norm": 1.5563299655914307,
176
+ "learning_rate": 4.7724558843040995e-05,
177
+ "loss": 0.2883,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 0.04146349553852788,
182
+ "grad_norm": 1.4941402673721313,
183
+ "learning_rate": 4.971474061297598e-05,
184
+ "loss": 0.2858,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 0.043122035360069,
189
+ "grad_norm": 1.2444454431533813,
190
+ "learning_rate": 5.170492238291097e-05,
191
+ "loss": 0.2626,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 0.04478057518161011,
196
+ "grad_norm": 1.2899850606918335,
197
+ "learning_rate": 5.3695104152845955e-05,
198
+ "loss": 0.29,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 0.046439115003151225,
203
+ "grad_norm": 1.6327451467514038,
204
+ "learning_rate": 5.568528592278094e-05,
205
+ "loss": 0.2897,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 0.04809765482469234,
210
+ "grad_norm": 1.2772693634033203,
211
+ "learning_rate": 5.767546769271593e-05,
212
+ "loss": 0.2926,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 0.04975619464623346,
217
+ "grad_norm": 1.2572505474090576,
218
+ "learning_rate": 5.966564946265092e-05,
219
+ "loss": 0.2816,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 0.05141473446777457,
224
+ "grad_norm": 1.1974629163742065,
225
+ "learning_rate": 6.16558312325859e-05,
226
+ "loss": 0.2772,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 0.053073274289315686,
231
+ "grad_norm": 1.1929668188095093,
232
+ "learning_rate": 6.364601300252089e-05,
233
+ "loss": 0.289,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 0.0547318141108568,
238
+ "grad_norm": 1.2904795408248901,
239
+ "learning_rate": 6.563619477245587e-05,
240
+ "loss": 0.2792,
241
+ "step": 1650
242
+ },
243
+ {
244
+ "epoch": 0.05639035393239792,
245
+ "grad_norm": 1.169552206993103,
246
+ "learning_rate": 6.762637654239086e-05,
247
+ "loss": 0.2814,
248
+ "step": 1700
249
+ },
250
+ {
251
+ "epoch": 0.05804889375393903,
252
+ "grad_norm": 1.1359589099884033,
253
+ "learning_rate": 6.961655831232586e-05,
254
+ "loss": 0.2936,
255
+ "step": 1750
256
+ },
257
+ {
258
+ "epoch": 0.05970743357548015,
259
+ "grad_norm": 1.2144523859024048,
260
+ "learning_rate": 7.160674008226085e-05,
261
+ "loss": 0.272,
262
+ "step": 1800
263
+ },
264
+ {
265
+ "epoch": 0.06136597339702126,
266
+ "grad_norm": 1.1847233772277832,
267
+ "learning_rate": 7.359692185219582e-05,
268
+ "loss": 0.2635,
269
+ "step": 1850
270
+ },
271
+ {
272
+ "epoch": 0.06302451321856238,
273
+ "grad_norm": 1.231399655342102,
274
+ "learning_rate": 7.558710362213082e-05,
275
+ "loss": 0.2772,
276
+ "step": 1900
277
+ },
278
+ {
279
+ "epoch": 0.0646830530401035,
280
+ "grad_norm": 1.0221542119979858,
281
+ "learning_rate": 7.757728539206581e-05,
282
+ "loss": 0.2694,
283
+ "step": 1950
284
+ },
285
+ {
286
+ "epoch": 0.06634159286164461,
287
+ "grad_norm": 1.091098427772522,
288
+ "learning_rate": 7.956746716200078e-05,
289
+ "loss": 0.289,
290
+ "step": 2000
291
+ },
292
+ {
293
+ "epoch": 0.06800013268318572,
294
+ "grad_norm": 1.1065690517425537,
295
+ "learning_rate": 8.155764893193577e-05,
296
+ "loss": 0.2974,
297
+ "step": 2050
298
+ },
299
+ {
300
+ "epoch": 0.06965867250472683,
301
+ "grad_norm": 0.9319403767585754,
302
+ "learning_rate": 8.354783070187077e-05,
303
+ "loss": 0.2819,
304
+ "step": 2100
305
+ },
306
+ {
307
+ "epoch": 0.07131721232626795,
308
+ "grad_norm": 1.1582205295562744,
309
+ "learning_rate": 8.553801247180575e-05,
310
+ "loss": 0.276,
311
+ "step": 2150
312
+ },
313
+ {
314
+ "epoch": 0.07297575214780908,
315
+ "grad_norm": 0.9972165822982788,
316
+ "learning_rate": 8.752819424174074e-05,
317
+ "loss": 0.2814,
318
+ "step": 2200
319
+ },
320
+ {
321
+ "epoch": 0.07463429196935019,
322
+ "grad_norm": 1.2905242443084717,
323
+ "learning_rate": 8.951837601167573e-05,
324
+ "loss": 0.2688,
325
+ "step": 2250
326
+ },
327
+ {
328
+ "epoch": 0.0762928317908913,
329
+ "grad_norm": 0.944217324256897,
330
+ "learning_rate": 9.150855778161071e-05,
331
+ "loss": 0.2786,
332
+ "step": 2300
333
+ },
334
+ {
335
+ "epoch": 0.07795137161243242,
336
+ "grad_norm": 0.9599469900131226,
337
+ "learning_rate": 9.349873955154569e-05,
338
+ "loss": 0.2677,
339
+ "step": 2350
340
+ },
341
+ {
342
+ "epoch": 0.07960991143397353,
343
+ "grad_norm": 1.0232480764389038,
344
+ "learning_rate": 9.54889213214807e-05,
345
+ "loss": 0.2751,
346
+ "step": 2400
347
+ },
348
+ {
349
+ "epoch": 0.08126845125551464,
350
+ "grad_norm": 0.8944919109344482,
351
+ "learning_rate": 9.747910309141567e-05,
352
+ "loss": 0.2883,
353
+ "step": 2450
354
+ },
355
+ {
356
+ "epoch": 0.08292699107705576,
357
+ "grad_norm": 0.8229078650474548,
358
+ "learning_rate": 9.946928486135066e-05,
359
+ "loss": 0.2783,
360
+ "step": 2500
361
+ },
362
+ {
363
+ "epoch": 0.08458553089859687,
364
+ "grad_norm": 1.0346490144729614,
365
+ "learning_rate": 0.00010145946663128565,
366
+ "loss": 0.2774,
367
+ "step": 2550
368
+ },
369
+ {
370
+ "epoch": 0.086244070720138,
371
+ "grad_norm": 1.1092647314071655,
372
+ "learning_rate": 0.00010344964840122063,
373
+ "loss": 0.254,
374
+ "step": 2600
375
+ },
376
+ {
377
+ "epoch": 0.08790261054167911,
378
+ "grad_norm": 1.0459065437316895,
379
+ "learning_rate": 0.00010543983017115562,
380
+ "loss": 0.2907,
381
+ "step": 2650
382
+ },
383
+ {
384
+ "epoch": 0.08956115036322022,
385
+ "grad_norm": 0.8515977263450623,
386
+ "learning_rate": 0.00010743001194109062,
387
+ "loss": 0.2767,
388
+ "step": 2700
389
+ },
390
+ {
391
+ "epoch": 0.09121969018476134,
392
+ "grad_norm": 0.8556401133537292,
393
+ "learning_rate": 0.0001094201937110256,
394
+ "loss": 0.2657,
395
+ "step": 2750
396
+ },
397
+ {
398
+ "epoch": 0.09287823000630245,
399
+ "grad_norm": 0.9892629384994507,
400
+ "learning_rate": 0.00011141037548096058,
401
+ "loss": 0.2635,
402
+ "step": 2800
403
+ },
404
+ {
405
+ "epoch": 0.09453676982784356,
406
+ "grad_norm": 0.8110876679420471,
407
+ "learning_rate": 0.00011340055725089557,
408
+ "loss": 0.2783,
409
+ "step": 2850
410
+ },
411
+ {
412
+ "epoch": 0.09619530964938468,
413
+ "grad_norm": 0.9971825480461121,
414
+ "learning_rate": 0.00011539073902083055,
415
+ "loss": 0.2723,
416
+ "step": 2900
417
+ },
418
+ {
419
+ "epoch": 0.09785384947092579,
420
+ "grad_norm": 0.8159605860710144,
421
+ "learning_rate": 0.00011738092079076555,
422
+ "loss": 0.2904,
423
+ "step": 2950
424
+ },
425
+ {
426
+ "epoch": 0.09951238929246692,
427
+ "grad_norm": 0.8371638655662537,
428
+ "learning_rate": 0.00011937110256070054,
429
+ "loss": 0.2725,
430
+ "step": 3000
431
+ },
432
+ {
433
+ "epoch": 0.10117092911400803,
434
+ "grad_norm": 0.9144733548164368,
435
+ "learning_rate": 0.00012136128433063553,
436
+ "loss": 0.2733,
437
+ "step": 3050
438
+ },
439
+ {
440
+ "epoch": 0.10282946893554915,
441
+ "grad_norm": 0.9184954762458801,
442
+ "learning_rate": 0.0001233514661005705,
443
+ "loss": 0.2778,
444
+ "step": 3100
445
+ },
446
+ {
447
+ "epoch": 0.10448800875709026,
448
+ "grad_norm": 0.760306179523468,
449
+ "learning_rate": 0.00012534164787050549,
450
+ "loss": 0.2714,
451
+ "step": 3150
452
+ },
453
+ {
454
+ "epoch": 0.10614654857863137,
455
+ "grad_norm": 0.7679548263549805,
456
+ "learning_rate": 0.00012733182964044047,
457
+ "loss": 0.2607,
458
+ "step": 3200
459
+ },
460
+ {
461
+ "epoch": 0.10780508840017249,
462
+ "grad_norm": 0.899803876876831,
463
+ "learning_rate": 0.00012932201141037549,
464
+ "loss": 0.2572,
465
+ "step": 3250
466
+ },
467
+ {
468
+ "epoch": 0.1094636282217136,
469
+ "grad_norm": 0.7616294026374817,
470
+ "learning_rate": 0.00013131219318031047,
471
+ "loss": 0.2733,
472
+ "step": 3300
473
+ },
474
+ {
475
+ "epoch": 0.11112216804325471,
476
+ "grad_norm": 0.8336801528930664,
477
+ "learning_rate": 0.00013330237495024543,
478
+ "loss": 0.274,
479
+ "step": 3350
480
+ },
481
+ {
482
+ "epoch": 0.11278070786479584,
483
+ "grad_norm": 0.7640217542648315,
484
+ "learning_rate": 0.00013529255672018042,
485
+ "loss": 0.2784,
486
+ "step": 3400
487
+ },
488
+ {
489
+ "epoch": 0.11443924768633695,
490
+ "grad_norm": 0.7056282758712769,
491
+ "learning_rate": 0.0001372827384901154,
492
+ "loss": 0.2639,
493
+ "step": 3450
494
+ },
495
+ {
496
+ "epoch": 0.11609778750787807,
497
+ "grad_norm": 0.8658528327941895,
498
+ "learning_rate": 0.00013927292026005042,
499
+ "loss": 0.2681,
500
+ "step": 3500
501
+ },
502
+ {
503
+ "epoch": 0.11775632732941918,
504
+ "grad_norm": 0.802803099155426,
505
+ "learning_rate": 0.0001412631020299854,
506
+ "loss": 0.2963,
507
+ "step": 3550
508
+ },
509
+ {
510
+ "epoch": 0.1194148671509603,
511
+ "grad_norm": 0.8917680978775024,
512
+ "learning_rate": 0.0001432532837999204,
513
+ "loss": 0.2826,
514
+ "step": 3600
515
+ },
516
+ {
517
+ "epoch": 0.12107340697250141,
518
+ "grad_norm": 0.7213064432144165,
519
+ "learning_rate": 0.00014524346556985538,
520
+ "loss": 0.2758,
521
+ "step": 3650
522
+ },
523
+ {
524
+ "epoch": 0.12273194679404252,
525
+ "grad_norm": 0.6630449891090393,
526
+ "learning_rate": 0.00014723364733979034,
527
+ "loss": 0.2669,
528
+ "step": 3700
529
+ },
530
+ {
531
+ "epoch": 0.12439048661558363,
532
+ "grad_norm": 0.9187581539154053,
533
+ "learning_rate": 0.00014922382910972532,
534
+ "loss": 0.2664,
535
+ "step": 3750
536
+ },
537
+ {
538
+ "epoch": 0.12604902643712476,
539
+ "grad_norm": 0.7035880088806152,
540
+ "learning_rate": 0.0001512140108796603,
541
+ "loss": 0.2813,
542
+ "step": 3800
543
+ },
544
+ {
545
+ "epoch": 0.12770756625866586,
546
+ "grad_norm": 0.9671415686607361,
547
+ "learning_rate": 0.0001532041926495953,
548
+ "loss": 0.2892,
549
+ "step": 3850
550
+ },
551
+ {
552
+ "epoch": 0.129366106080207,
553
+ "grad_norm": 0.809038519859314,
554
+ "learning_rate": 0.00015519437441953028,
555
+ "loss": 0.2662,
556
+ "step": 3900
557
+ },
558
+ {
559
+ "epoch": 0.1310246459017481,
560
+ "grad_norm": 0.8309035301208496,
561
+ "learning_rate": 0.0001571845561894653,
562
+ "loss": 0.2567,
563
+ "step": 3950
564
+ },
565
+ {
566
+ "epoch": 0.13268318572328922,
567
+ "grad_norm": 0.7343366146087646,
568
+ "learning_rate": 0.00015917473795940028,
569
+ "loss": 0.2789,
570
+ "step": 4000
571
+ },
572
+ {
573
+ "epoch": 0.13434172554483034,
574
+ "grad_norm": 0.6523250341415405,
575
+ "learning_rate": 0.00016116491972933527,
576
+ "loss": 0.2568,
577
+ "step": 4050
578
+ },
579
+ {
580
+ "epoch": 0.13600026536637144,
581
+ "grad_norm": 0.6583340167999268,
582
+ "learning_rate": 0.00016315510149927026,
583
+ "loss": 0.2694,
584
+ "step": 4100
585
+ },
586
+ {
587
+ "epoch": 0.13765880518791257,
588
+ "grad_norm": 0.674410879611969,
589
+ "learning_rate": 0.00016514528326920524,
590
+ "loss": 0.2839,
591
+ "step": 4150
592
+ },
593
+ {
594
+ "epoch": 0.13931734500945367,
595
+ "grad_norm": 0.9803613424301147,
596
+ "learning_rate": 0.00016713546503914023,
597
+ "loss": 0.2789,
598
+ "step": 4200
599
+ },
600
+ {
601
+ "epoch": 0.1409758848309948,
602
+ "grad_norm": 0.6113103032112122,
603
+ "learning_rate": 0.00016912564680907522,
604
+ "loss": 0.263,
605
+ "step": 4250
606
+ },
607
+ {
608
+ "epoch": 0.1426344246525359,
609
+ "grad_norm": 0.6843391060829163,
610
+ "learning_rate": 0.0001711158285790102,
611
+ "loss": 0.2673,
612
+ "step": 4300
613
+ },
614
+ {
615
+ "epoch": 0.14429296447407702,
616
+ "grad_norm": 0.7611909508705139,
617
+ "learning_rate": 0.0001731060103489452,
618
+ "loss": 0.2628,
619
+ "step": 4350
620
+ },
621
+ {
622
+ "epoch": 0.14595150429561815,
623
+ "grad_norm": 0.7338739037513733,
624
+ "learning_rate": 0.00017509619211888015,
625
+ "loss": 0.2728,
626
+ "step": 4400
627
+ },
628
+ {
629
+ "epoch": 0.14761004411715925,
630
+ "grad_norm": 0.6578771471977234,
631
+ "learning_rate": 0.00017708637388881514,
632
+ "loss": 0.2503,
633
+ "step": 4450
634
+ },
635
+ {
636
+ "epoch": 0.14926858393870038,
637
+ "grad_norm": 0.6933615803718567,
638
+ "learning_rate": 0.00017907655565875018,
639
+ "loss": 0.2655,
640
+ "step": 4500
641
+ },
642
+ {
643
+ "epoch": 0.15092712376024148,
644
+ "grad_norm": 0.574677586555481,
645
+ "learning_rate": 0.00018106673742868514,
646
+ "loss": 0.2545,
647
+ "step": 4550
648
+ },
649
+ {
650
+ "epoch": 0.1525856635817826,
651
+ "grad_norm": 0.5872885584831238,
652
+ "learning_rate": 0.00018305691919862012,
653
+ "loss": 0.2728,
654
+ "step": 4600
655
+ },
656
+ {
657
+ "epoch": 0.1542442034033237,
658
+ "grad_norm": 0.7030934691429138,
659
+ "learning_rate": 0.0001850471009685551,
660
+ "loss": 0.2691,
661
+ "step": 4650
662
+ },
663
+ {
664
+ "epoch": 0.15590274322486483,
665
+ "grad_norm": 0.6045774817466736,
666
+ "learning_rate": 0.0001870372827384901,
667
+ "loss": 0.2676,
668
+ "step": 4700
669
+ },
670
+ {
671
+ "epoch": 0.15756128304640593,
672
+ "grad_norm": 0.6607248783111572,
673
+ "learning_rate": 0.00018902746450842508,
674
+ "loss": 0.2706,
675
+ "step": 4750
676
+ },
677
+ {
678
+ "epoch": 0.15921982286794706,
679
+ "grad_norm": 0.6813399195671082,
680
+ "learning_rate": 0.00019101764627836007,
681
+ "loss": 0.2602,
682
+ "step": 4800
683
+ },
684
+ {
685
+ "epoch": 0.16087836268948819,
686
+ "grad_norm": 0.6294276714324951,
687
+ "learning_rate": 0.00019300782804829506,
688
+ "loss": 0.2861,
689
+ "step": 4850
690
+ },
691
+ {
692
+ "epoch": 0.16253690251102929,
693
+ "grad_norm": 0.7689523696899414,
694
+ "learning_rate": 0.00019499800981823004,
695
+ "loss": 0.2919,
696
+ "step": 4900
697
+ },
698
+ {
699
+ "epoch": 0.1641954423325704,
700
+ "grad_norm": 0.6293140053749084,
701
+ "learning_rate": 0.00019698819158816503,
702
+ "loss": 0.2625,
703
+ "step": 4950
704
+ },
705
+ {
706
+ "epoch": 0.1658539821541115,
707
+ "grad_norm": 0.6510013937950134,
708
+ "learning_rate": 0.00019897837335810002,
709
+ "loss": 0.2645,
710
+ "step": 5000
711
+ },
712
+ {
713
+ "epoch": 0.16751252197565264,
714
+ "grad_norm": 0.655305027961731,
715
+ "learning_rate": 0.00020096855512803503,
716
+ "loss": 0.2899,
717
+ "step": 5050
718
+ },
719
+ {
720
+ "epoch": 0.16917106179719374,
721
+ "grad_norm": 0.6357027888298035,
722
+ "learning_rate": 0.00020295873689797002,
723
+ "loss": 0.2841,
724
+ "step": 5100
725
+ },
726
+ {
727
+ "epoch": 0.17082960161873487,
728
+ "grad_norm": 0.6427227258682251,
729
+ "learning_rate": 0.000204948918667905,
730
+ "loss": 0.2904,
731
+ "step": 5150
732
+ },
733
+ {
734
+ "epoch": 0.172488141440276,
735
+ "grad_norm": 0.5456770658493042,
736
+ "learning_rate": 0.00020693910043784,
737
+ "loss": 0.2606,
738
+ "step": 5200
739
+ },
740
+ {
741
+ "epoch": 0.1741466812618171,
742
+ "grad_norm": 0.6049277186393738,
743
+ "learning_rate": 0.00020892928220777495,
744
+ "loss": 0.2789,
745
+ "step": 5250
746
+ },
747
+ {
748
+ "epoch": 0.17580522108335822,
749
+ "grad_norm": 0.604241669178009,
750
+ "learning_rate": 0.00021091946397770994,
751
+ "loss": 0.2855,
752
+ "step": 5300
753
+ },
754
+ {
755
+ "epoch": 0.17746376090489932,
756
+ "grad_norm": 0.5764302015304565,
757
+ "learning_rate": 0.00021290964574764492,
758
+ "loss": 0.2724,
759
+ "step": 5350
760
+ },
761
+ {
762
+ "epoch": 0.17912230072644045,
763
+ "grad_norm": 0.5926195979118347,
764
+ "learning_rate": 0.0002148998275175799,
765
+ "loss": 0.2693,
766
+ "step": 5400
767
+ },
768
+ {
769
+ "epoch": 0.18078084054798155,
770
+ "grad_norm": 0.550534188747406,
771
+ "learning_rate": 0.0002168900092875149,
772
+ "loss": 0.2739,
773
+ "step": 5450
774
+ },
775
+ {
776
+ "epoch": 0.18243938036952267,
777
+ "grad_norm": 0.9231746792793274,
778
+ "learning_rate": 0.00021888019105744988,
779
+ "loss": 0.2713,
780
+ "step": 5500
781
+ },
782
+ {
783
+ "epoch": 0.18409792019106377,
784
+ "grad_norm": 0.6074857115745544,
785
+ "learning_rate": 0.00022087037282738487,
786
+ "loss": 0.2628,
787
+ "step": 5550
788
+ },
789
+ {
790
+ "epoch": 0.1857564600126049,
791
+ "grad_norm": 0.5123354196548462,
792
+ "learning_rate": 0.00022286055459731988,
793
+ "loss": 0.2752,
794
+ "step": 5600
795
+ },
796
+ {
797
+ "epoch": 0.18741499983414603,
798
+ "grad_norm": 0.5391439199447632,
799
+ "learning_rate": 0.00022485073636725487,
800
+ "loss": 0.2772,
801
+ "step": 5650
802
+ },
803
+ {
804
+ "epoch": 0.18907353965568713,
805
+ "grad_norm": 0.5493964552879333,
806
+ "learning_rate": 0.00022684091813718986,
807
+ "loss": 0.2656,
808
+ "step": 5700
809
+ },
810
+ {
811
+ "epoch": 0.19073207947722826,
812
+ "grad_norm": 0.531689465045929,
813
+ "learning_rate": 0.00022883109990712484,
814
+ "loss": 0.2652,
815
+ "step": 5750
816
+ },
817
+ {
818
+ "epoch": 0.19239061929876936,
819
+ "grad_norm": 0.6406611204147339,
820
+ "learning_rate": 0.00023082128167705983,
821
+ "loss": 0.2807,
822
+ "step": 5800
823
+ },
824
+ {
825
+ "epoch": 0.19404915912031048,
826
+ "grad_norm": 0.5147985219955444,
827
+ "learning_rate": 0.00023281146344699481,
828
+ "loss": 0.2859,
829
+ "step": 5850
830
+ },
831
+ {
832
+ "epoch": 0.19570769894185158,
833
+ "grad_norm": 0.5153710246086121,
834
+ "learning_rate": 0.0002348016452169298,
835
+ "loss": 0.2529,
836
+ "step": 5900
837
+ },
838
+ {
839
+ "epoch": 0.1973662387633927,
840
+ "grad_norm": 0.5277467966079712,
841
+ "learning_rate": 0.00023679182698686476,
842
+ "loss": 0.2731,
843
+ "step": 5950
844
+ },
845
+ {
846
+ "epoch": 0.19902477858493384,
847
+ "grad_norm": 0.480101615190506,
848
+ "learning_rate": 0.00023878200875679975,
849
+ "loss": 0.2784,
850
+ "step": 6000
851
+ },
852
+ {
853
+ "epoch": 0.20068331840647494,
854
+ "grad_norm": 0.5489948987960815,
855
+ "learning_rate": 0.00024077219052673473,
856
+ "loss": 0.2968,
857
+ "step": 6050
858
+ },
859
+ {
860
+ "epoch": 0.20234185822801606,
861
+ "grad_norm": 0.6148312091827393,
862
+ "learning_rate": 0.00024276237229666972,
863
+ "loss": 0.2731,
864
+ "step": 6100
865
+ },
866
+ {
867
+ "epoch": 0.20400039804955716,
868
+ "grad_norm": 0.5747624039649963,
869
+ "learning_rate": 0.00024475255406660476,
870
+ "loss": 0.2701,
871
+ "step": 6150
872
+ },
873
+ {
874
+ "epoch": 0.2056589378710983,
875
+ "grad_norm": 0.6135842800140381,
876
+ "learning_rate": 0.00024674273583653975,
877
+ "loss": 0.2691,
878
+ "step": 6200
879
+ },
880
+ {
881
+ "epoch": 0.2073174776926394,
882
+ "grad_norm": 0.5454925298690796,
883
+ "learning_rate": 0.00024873291760647473,
884
+ "loss": 0.2737,
885
+ "step": 6250
886
+ },
887
+ {
888
+ "epoch": 0.20897601751418052,
889
+ "grad_norm": 0.6004371047019958,
890
+ "learning_rate": 0.0002507230993764097,
891
+ "loss": 0.288,
892
+ "step": 6300
893
+ },
894
+ {
895
+ "epoch": 0.21063455733572162,
896
+ "grad_norm": 0.5263451933860779,
897
+ "learning_rate": 0.0002527132811463447,
898
+ "loss": 0.2752,
899
+ "step": 6350
900
+ },
901
+ {
902
+ "epoch": 0.21229309715726274,
903
+ "grad_norm": 0.5507785677909851,
904
+ "learning_rate": 0.00025470346291627964,
905
+ "loss": 0.2749,
906
+ "step": 6400
907
+ },
908
+ {
909
+ "epoch": 0.21395163697880387,
910
+ "grad_norm": 0.5042358636856079,
911
+ "learning_rate": 0.0002566936446862146,
912
+ "loss": 0.2773,
913
+ "step": 6450
914
+ },
915
+ {
916
+ "epoch": 0.21561017680034497,
917
+ "grad_norm": 0.4615878164768219,
918
+ "learning_rate": 0.0002586838264561496,
919
+ "loss": 0.2782,
920
+ "step": 6500
921
+ },
922
+ {
923
+ "epoch": 0.2172687166218861,
924
+ "grad_norm": 0.5063010454177856,
925
+ "learning_rate": 0.0002606740082260846,
926
+ "loss": 0.2805,
927
+ "step": 6550
928
+ },
929
+ {
930
+ "epoch": 0.2189272564434272,
931
+ "grad_norm": 0.5367996096611023,
932
+ "learning_rate": 0.0002626641899960196,
933
+ "loss": 0.2689,
934
+ "step": 6600
935
+ },
936
+ {
937
+ "epoch": 0.22058579626496833,
938
+ "grad_norm": 0.4879344701766968,
939
+ "learning_rate": 0.0002646543717659546,
940
+ "loss": 0.2745,
941
+ "step": 6650
942
+ },
943
+ {
944
+ "epoch": 0.22224433608650943,
945
+ "grad_norm": 0.504068911075592,
946
+ "learning_rate": 0.00026664455353588956,
947
+ "loss": 0.2685,
948
+ "step": 6700
949
+ },
950
+ {
951
+ "epoch": 0.22390287590805055,
952
+ "grad_norm": 0.4813278913497925,
953
+ "learning_rate": 0.0002686347353058246,
954
+ "loss": 0.261,
955
+ "step": 6750
956
+ },
957
+ {
958
+ "epoch": 0.22556141572959168,
959
+ "grad_norm": 0.46839526295661926,
960
+ "learning_rate": 0.0002706249170757596,
961
+ "loss": 0.2702,
962
+ "step": 6800
963
+ },
964
+ {
965
+ "epoch": 0.22721995555113278,
966
+ "grad_norm": 0.5233504176139832,
967
+ "learning_rate": 0.0002726150988456946,
968
+ "loss": 0.2736,
969
+ "step": 6850
970
+ },
971
+ {
972
+ "epoch": 0.2288784953726739,
973
+ "grad_norm": 0.5277813673019409,
974
+ "learning_rate": 0.00027460528061562956,
975
+ "loss": 0.2772,
976
+ "step": 6900
977
+ },
978
+ {
979
+ "epoch": 0.230537035194215,
980
+ "grad_norm": 0.6029180884361267,
981
+ "learning_rate": 0.00027659546238556455,
982
+ "loss": 0.2768,
983
+ "step": 6950
984
+ },
985
+ {
986
+ "epoch": 0.23219557501575613,
987
+ "grad_norm": 0.575391948223114,
988
+ "learning_rate": 0.00027858564415549953,
989
+ "loss": 0.265,
990
+ "step": 7000
991
+ },
992
+ {
993
+ "epoch": 0.23385411483729723,
994
+ "grad_norm": 0.4373869299888611,
995
+ "learning_rate": 0.0002805758259254345,
996
+ "loss": 0.2657,
997
+ "step": 7050
998
+ },
999
+ {
1000
+ "epoch": 0.23551265465883836,
1001
+ "grad_norm": 0.8708984851837158,
1002
+ "learning_rate": 0.00028256600769536945,
1003
+ "loss": 0.2754,
1004
+ "step": 7100
1005
+ },
1006
+ {
1007
+ "epoch": 0.23717119448037946,
1008
+ "grad_norm": 0.5389964580535889,
1009
+ "learning_rate": 0.00028455618946530444,
1010
+ "loss": 0.2654,
1011
+ "step": 7150
1012
+ },
1013
+ {
1014
+ "epoch": 0.2388297343019206,
1015
+ "grad_norm": 0.5265533328056335,
1016
+ "learning_rate": 0.0002865463712352394,
1017
+ "loss": 0.2943,
1018
+ "step": 7200
1019
+ },
1020
+ {
1021
+ "epoch": 0.24048827412346171,
1022
+ "grad_norm": 0.43175622820854187,
1023
+ "learning_rate": 0.0002885365530051744,
1024
+ "loss": 0.2597,
1025
+ "step": 7250
1026
+ },
1027
+ {
1028
+ "epoch": 0.24214681394500281,
1029
+ "grad_norm": 0.42523327469825745,
1030
+ "learning_rate": 0.00029052673477510945,
1031
+ "loss": 0.281,
1032
+ "step": 7300
1033
+ },
1034
+ {
1035
+ "epoch": 0.24380535376654394,
1036
+ "grad_norm": 0.4567892551422119,
1037
+ "learning_rate": 0.00029251691654504444,
1038
+ "loss": 0.2776,
1039
+ "step": 7350
1040
+ },
1041
+ {
1042
+ "epoch": 0.24546389358808504,
1043
+ "grad_norm": 0.4635256826877594,
1044
+ "learning_rate": 0.0002945070983149794,
1045
+ "loss": 0.2608,
1046
+ "step": 7400
1047
+ },
1048
+ {
1049
+ "epoch": 0.24712243340962617,
1050
+ "grad_norm": 0.43223607540130615,
1051
+ "learning_rate": 0.0002964972800849144,
1052
+ "loss": 0.2577,
1053
+ "step": 7450
1054
+ },
1055
+ {
1056
+ "epoch": 0.24878097323116727,
1057
+ "grad_norm": 0.4766380190849304,
1058
+ "learning_rate": 0.0002984874618548494,
1059
+ "loss": 0.2717,
1060
+ "step": 7500
1061
+ },
1062
+ {
1063
+ "epoch": 0.2504395130527084,
1064
+ "grad_norm": 0.47827085852622986,
1065
+ "learning_rate": 0.0002999999948018406,
1066
+ "loss": 0.2748,
1067
+ "step": 7550
1068
+ },
1069
+ {
1070
+ "epoch": 0.2520980528742495,
1071
+ "grad_norm": 0.5068143606185913,
1072
+ "learning_rate": 0.000299999861238043,
1073
+ "loss": 0.2786,
1074
+ "step": 7600
1075
+ },
1076
+ {
1077
+ "epoch": 0.25375659269579065,
1078
+ "grad_norm": 0.48091641068458557,
1079
+ "learning_rate": 0.0002999995471827833,
1080
+ "loss": 0.2623,
1081
+ "step": 7650
1082
+ },
1083
+ {
1084
+ "epoch": 0.2554151325173317,
1085
+ "grad_norm": 0.5682414770126343,
1086
+ "learning_rate": 0.00029999905263643944,
1087
+ "loss": 0.2784,
1088
+ "step": 7700
1089
+ },
1090
+ {
1091
+ "epoch": 0.25707367233887285,
1092
+ "grad_norm": 0.42487502098083496,
1093
+ "learning_rate": 0.00029999837759960636,
1094
+ "loss": 0.2632,
1095
+ "step": 7750
1096
+ },
1097
+ {
1098
+ "epoch": 0.258732212160414,
1099
+ "grad_norm": 0.3851436674594879,
1100
+ "learning_rate": 0.00029999752207309643,
1101
+ "loss": 0.262,
1102
+ "step": 7800
1103
+ },
1104
+ {
1105
+ "epoch": 0.2603907519819551,
1106
+ "grad_norm": 0.4501873552799225,
1107
+ "learning_rate": 0.000299996486057939,
1108
+ "loss": 0.2579,
1109
+ "step": 7850
1110
+ },
1111
+ {
1112
+ "epoch": 0.2620492918034962,
1113
+ "grad_norm": 0.42192208766937256,
1114
+ "learning_rate": 0.00029999526955538073,
1115
+ "loss": 0.2651,
1116
+ "step": 7900
1117
+ },
1118
+ {
1119
+ "epoch": 0.2637078316250373,
1120
+ "grad_norm": 0.3992311656475067,
1121
+ "learning_rate": 0.0002999938725668854,
1122
+ "loss": 0.2687,
1123
+ "step": 7950
1124
+ },
1125
+ {
1126
+ "epoch": 0.26536637144657843,
1127
+ "grad_norm": 0.4475759267807007,
1128
+ "learning_rate": 0.000299992295094134,
1129
+ "loss": 0.262,
1130
+ "step": 8000
1131
+ },
1132
+ {
1133
+ "epoch": 0.26702491126811956,
1134
+ "grad_norm": 1.0701886415481567,
1135
+ "learning_rate": 0.00029999053713902464,
1136
+ "loss": 0.2702,
1137
+ "step": 8050
1138
+ },
1139
+ {
1140
+ "epoch": 0.2686834510896607,
1141
+ "grad_norm": 0.7080543637275696,
1142
+ "learning_rate": 0.00029998859870367263,
1143
+ "loss": 0.2752,
1144
+ "step": 8100
1145
+ },
1146
+ {
1147
+ "epoch": 0.27034199091120176,
1148
+ "grad_norm": 0.47246304154396057,
1149
+ "learning_rate": 0.0002999864797904105,
1150
+ "loss": 0.2708,
1151
+ "step": 8150
1152
+ },
1153
+ {
1154
+ "epoch": 0.2720005307327429,
1155
+ "grad_norm": 0.3701266348361969,
1156
+ "learning_rate": 0.0002999841804017878,
1157
+ "loss": 0.2607,
1158
+ "step": 8200
1159
+ },
1160
+ {
1161
+ "epoch": 0.273659070554284,
1162
+ "grad_norm": 0.41229259967803955,
1163
+ "learning_rate": 0.00029998170054057135,
1164
+ "loss": 0.2618,
1165
+ "step": 8250
1166
+ },
1167
+ {
1168
+ "epoch": 0.27531761037582514,
1169
+ "grad_norm": 0.41077643632888794,
1170
+ "learning_rate": 0.00029997904020974517,
1171
+ "loss": 0.2554,
1172
+ "step": 8300
1173
+ },
1174
+ {
1175
+ "epoch": 0.27697615019736627,
1176
+ "grad_norm": 0.36561328172683716,
1177
+ "learning_rate": 0.0002999761994125103,
1178
+ "loss": 0.2764,
1179
+ "step": 8350
1180
+ },
1181
+ {
1182
+ "epoch": 0.27863469001890734,
1183
+ "grad_norm": 0.4316375255584717,
1184
+ "learning_rate": 0.00029997317815228506,
1185
+ "loss": 0.2678,
1186
+ "step": 8400
1187
+ },
1188
+ {
1189
+ "epoch": 0.28029322984044847,
1190
+ "grad_norm": 0.4210038483142853,
1191
+ "learning_rate": 0.0002999699764327049,
1192
+ "loss": 0.2462,
1193
+ "step": 8450
1194
+ },
1195
+ {
1196
+ "epoch": 0.2819517696619896,
1197
+ "grad_norm": 0.4256073236465454,
1198
+ "learning_rate": 0.0002999665942576223,
1199
+ "loss": 0.271,
1200
+ "step": 8500
1201
+ },
1202
+ {
1203
+ "epoch": 0.2836103094835307,
1204
+ "grad_norm": 0.40567606687545776,
1205
+ "learning_rate": 0.000299963031631107,
1206
+ "loss": 0.2574,
1207
+ "step": 8550
1208
+ },
1209
+ {
1210
+ "epoch": 0.2852688493050718,
1211
+ "grad_norm": 0.3660525381565094,
1212
+ "learning_rate": 0.00029995928855744577,
1213
+ "loss": 0.2697,
1214
+ "step": 8600
1215
+ },
1216
+ {
1217
+ "epoch": 0.2869273891266129,
1218
+ "grad_norm": 0.44151097536087036,
1219
+ "learning_rate": 0.0002999553650411427,
1220
+ "loss": 0.2508,
1221
+ "step": 8650
1222
+ },
1223
+ {
1224
+ "epoch": 0.28858592894815405,
1225
+ "grad_norm": 0.38013362884521484,
1226
+ "learning_rate": 0.00029995126108691865,
1227
+ "loss": 0.237,
1228
+ "step": 8700
1229
+ },
1230
+ {
1231
+ "epoch": 0.2902444687696952,
1232
+ "grad_norm": 0.44534966349601746,
1233
+ "learning_rate": 0.00029994697669971204,
1234
+ "loss": 0.2575,
1235
+ "step": 8750
1236
+ },
1237
+ {
1238
+ "epoch": 0.2919030085912363,
1239
+ "grad_norm": 0.40814974904060364,
1240
+ "learning_rate": 0.00029994251188467813,
1241
+ "loss": 0.2494,
1242
+ "step": 8800
1243
+ },
1244
+ {
1245
+ "epoch": 0.2935615484127774,
1246
+ "grad_norm": 0.3318258821964264,
1247
+ "learning_rate": 0.0002999378666471892,
1248
+ "loss": 0.2381,
1249
+ "step": 8850
1250
+ },
1251
+ {
1252
+ "epoch": 0.2952200882343185,
1253
+ "grad_norm": 0.43739181756973267,
1254
+ "learning_rate": 0.00029993304099283483,
1255
+ "loss": 0.2388,
1256
+ "step": 8900
1257
+ },
1258
+ {
1259
+ "epoch": 0.29687862805585963,
1260
+ "grad_norm": 0.3652569055557251,
1261
+ "learning_rate": 0.00029992803492742166,
1262
+ "loss": 0.2678,
1263
+ "step": 8950
1264
+ },
1265
+ {
1266
+ "epoch": 0.29853716787740076,
1267
+ "grad_norm": 0.37581753730773926,
1268
+ "learning_rate": 0.0002999228484569734,
1269
+ "loss": 0.258,
1270
+ "step": 9000
1271
+ },
1272
+ {
1273
+ "epoch": 0.3001957076989418,
1274
+ "grad_norm": 0.36842867732048035,
1275
+ "learning_rate": 0.0002999174815877307,
1276
+ "loss": 0.241,
1277
+ "step": 9050
1278
+ },
1279
+ {
1280
+ "epoch": 0.30185424752048295,
1281
+ "grad_norm": 0.36468687653541565,
1282
+ "learning_rate": 0.0002999119343261515,
1283
+ "loss": 0.2601,
1284
+ "step": 9100
1285
+ },
1286
+ {
1287
+ "epoch": 0.3035127873420241,
1288
+ "grad_norm": 0.44215506315231323,
1289
+ "learning_rate": 0.0002999062066789106,
1290
+ "loss": 0.2577,
1291
+ "step": 9150
1292
+ },
1293
+ {
1294
+ "epoch": 0.3051713271635652,
1295
+ "grad_norm": 0.379226952791214,
1296
+ "learning_rate": 0.00029990029865290007,
1297
+ "loss": 0.2509,
1298
+ "step": 9200
1299
+ },
1300
+ {
1301
+ "epoch": 0.30682986698510634,
1302
+ "grad_norm": 0.3723479211330414,
1303
+ "learning_rate": 0.0002998942102552288,
1304
+ "loss": 0.242,
1305
+ "step": 9250
1306
+ },
1307
+ {
1308
+ "epoch": 0.3084884068066474,
1309
+ "grad_norm": 0.34578976035118103,
1310
+ "learning_rate": 0.00029988794149322283,
1311
+ "loss": 0.2791,
1312
+ "step": 9300
1313
+ },
1314
+ {
1315
+ "epoch": 0.31014694662818854,
1316
+ "grad_norm": 0.33455216884613037,
1317
+ "learning_rate": 0.0002998814923744253,
1318
+ "loss": 0.2618,
1319
+ "step": 9350
1320
+ },
1321
+ {
1322
+ "epoch": 0.31180548644972966,
1323
+ "grad_norm": 0.3505556583404541,
1324
+ "learning_rate": 0.0002998748629065962,
1325
+ "loss": 0.2416,
1326
+ "step": 9400
1327
+ },
1328
+ {
1329
+ "epoch": 0.3134640262712708,
1330
+ "grad_norm": 0.3335639536380768,
1331
+ "learning_rate": 0.00029986805309771276,
1332
+ "loss": 0.2321,
1333
+ "step": 9450
1334
+ },
1335
+ {
1336
+ "epoch": 0.31512256609281186,
1337
+ "grad_norm": 0.31904950737953186,
1338
+ "learning_rate": 0.00029986106295596884,
1339
+ "loss": 0.2405,
1340
+ "step": 9500
1341
+ },
1342
+ {
1343
+ "epoch": 0.316781105914353,
1344
+ "grad_norm": 0.41776758432388306,
1345
+ "learning_rate": 0.0002998538924897757,
1346
+ "loss": 0.2359,
1347
+ "step": 9550
1348
+ },
1349
+ {
1350
+ "epoch": 0.3184396457358941,
1351
+ "grad_norm": 0.3445201814174652,
1352
+ "learning_rate": 0.0002998465417077613,
1353
+ "loss": 0.2451,
1354
+ "step": 9600
1355
+ },
1356
+ {
1357
+ "epoch": 0.32009818555743524,
1358
+ "grad_norm": 0.36852407455444336,
1359
+ "learning_rate": 0.0002998390106187708,
1360
+ "loss": 0.238,
1361
+ "step": 9650
1362
+ },
1363
+ {
1364
+ "epoch": 0.32175672537897637,
1365
+ "grad_norm": 0.3637458384037018,
1366
+ "learning_rate": 0.00029983129923186614,
1367
+ "loss": 0.2286,
1368
+ "step": 9700
1369
+ },
1370
+ {
1371
+ "epoch": 0.32341526520051744,
1372
+ "grad_norm": 0.35523203015327454,
1373
+ "learning_rate": 0.00029982340755632615,
1374
+ "loss": 0.2649,
1375
+ "step": 9750
1376
+ },
1377
+ {
1378
+ "epoch": 0.32507380502205857,
1379
+ "grad_norm": 0.4552542567253113,
1380
+ "learning_rate": 0.00029981533560164683,
1381
+ "loss": 0.2462,
1382
+ "step": 9800
1383
+ },
1384
+ {
1385
+ "epoch": 0.3267323448435997,
1386
+ "grad_norm": 0.3182304799556732,
1387
+ "learning_rate": 0.0002998070833775409,
1388
+ "loss": 0.259,
1389
+ "step": 9850
1390
+ },
1391
+ {
1392
+ "epoch": 0.3283908846651408,
1393
+ "grad_norm": 0.3835619390010834,
1394
+ "learning_rate": 0.0002997986508939382,
1395
+ "loss": 0.2628,
1396
+ "step": 9900
1397
+ },
1398
+ {
1399
+ "epoch": 0.33004942448668195,
1400
+ "grad_norm": 0.3364975154399872,
1401
+ "learning_rate": 0.00029979003816098514,
1402
+ "loss": 0.2512,
1403
+ "step": 9950
1404
+ },
1405
+ {
1406
+ "epoch": 0.331707964308223,
1407
+ "grad_norm": 0.3546120822429657,
1408
+ "learning_rate": 0.0002997812451890454,
1409
+ "loss": 0.2312,
1410
+ "step": 10000
1411
+ }
1412
+ ],
1413
+ "logging_steps": 50,
1414
+ "max_steps": 150735,
1415
+ "num_input_tokens_seen": 0,
1416
+ "num_train_epochs": 5,
1417
+ "save_steps": 10000,
1418
+ "stateful_callbacks": {
1419
+ "TrainerControl": {
1420
+ "args": {
1421
+ "should_epoch_stop": false,
1422
+ "should_evaluate": false,
1423
+ "should_log": false,
1424
+ "should_save": true,
1425
+ "should_training_stop": false
1426
+ },
1427
+ "attributes": {}
1428
+ }
1429
+ },
1430
+ "total_flos": 0.0,
1431
+ "train_batch_size": 16,
1432
+ "trial_name": null,
1433
+ "trial_params": null
1434
+ }