Rubywong123 committed on
Commit
92af847
·
verified ·
1 Parent(s): 016ea25

Upload folder using huggingface_hub

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.9709543568464731,
3
- "total_flos": 2.029407915034542e+17,
4
- "train_loss": 0.3740218333899975,
5
- "train_runtime": 1097.5227,
6
- "train_samples": 1927,
7
- "train_samples_per_second": 3.512,
8
- "train_steps_per_second": 0.073
9
  }
 
1
  {
2
  "epoch": 1.9709543568464731,
3
+ "total_flos": 2.0214124294176768e+17,
4
+ "train_loss": 0.371533726900816,
5
+ "train_runtime": 1201.5473,
6
+ "train_samples": 1925,
7
+ "train_samples_per_second": 3.204,
8
+ "train_steps_per_second": 0.067
9
  }
checkpoint-80/global_step79/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62b22a56d979b4c26809ef202f3631defcfde36d400e4794095dfdac4dbcaf57
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2257005a35eb6d2d6b80dadf25283d732569dedbe0117a1fd209082a4199ece
3
  size 24090788620
checkpoint-80/global_step79/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce8eec1c04ea7a3a40461c28a5208b9da782c17fb174d40403c11755ac4c16ab
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50510026f065bb8f101221bfd4cfb205989c018ee21ad2e1786f56e906249f7
3
  size 24090788620
checkpoint-80/global_step79/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7d8b55ce4d7849ab53b38fdd082214f0990ac6e08874b6fe1419fd218df73cb
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3d4cef230b9d567afef3f5aca67f19ec0b7348a8ecad6764fe048271e596a9a
3
  size 24090788620
checkpoint-80/global_step79/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:637628d474355549e029cbfbc43763a302b3a79f4be352e4b39261d861d3cb41
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c796d4ff1b19adbb43b0f5473fc2c42696a6a1b5cb8f3cfb02b7ea5926abc375
3
  size 24090788620
checkpoint-80/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e418375f069dc0a6e22dbe1f9fe2e9c1a35326401000926e39618012daef28
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c6e909727e20263eb587685655ef8609a2f4b6e653cb0678f747e2125b76fb7
3
  size 4976698672
checkpoint-80/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d20cd5d2b6d4fb439d5605863c4c5c974add43f60501ccb272f77d725ea63a9e
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba713efd72a7e5a23af5094c271658a5e9f240179cda56d6144a6e43ec878c2
3
  size 4999802720
checkpoint-80/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baf527bb1b8e21c120d29d08c1f65101e6dae78a7f16b082b592f156b54cde5d
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5925b2fee006cee5ccc1d208289db4b40b34930bb14ff7f78e0519a6d71dafb5
3
  size 4915916176
checkpoint-80/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2a64cb51e1a2804c06ce816c5c8aaf83f6f2b8cc3fbd1a4ad85549c3c263fe5
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba94e44e7c61f5667b9542de32dde71b1e571b4291a68a5b4baa5e905aea305
3
  size 1168138808
checkpoint-80/trainer_state.json CHANGED
@@ -10,137 +10,137 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.024896265560165973,
13
- "grad_norm": 0.09802406423206611,
14
  "learning_rate": 1.25e-06,
15
- "loss": 0.4475,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.12448132780082988,
20
- "grad_norm": 0.06629266158913129,
21
  "learning_rate": 6.25e-06,
22
- "loss": 0.4034,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.24896265560165975,
27
- "grad_norm": 0.07682971594488436,
28
  "learning_rate": 9.980973490458728e-06,
29
- "loss": 0.447,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.37344398340248963,
34
- "grad_norm": 0.06952920500301711,
35
  "learning_rate": 9.768584753741134e-06,
36
- "loss": 0.4512,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.4979253112033195,
41
- "grad_norm": 0.05665883883073202,
42
  "learning_rate": 9.330127018922195e-06,
43
- "loss": 0.4115,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6224066390041494,
48
- "grad_norm": 0.05543531120968404,
49
  "learning_rate": 8.68638668405062e-06,
50
- "loss": 0.4224,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7468879668049793,
55
- "grad_norm": 0.05055338657549615,
56
  "learning_rate": 7.86788218175523e-06,
57
- "loss": 0.3645,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8713692946058091,
62
- "grad_norm": 0.04640618215423049,
63
  "learning_rate": 6.913417161825449e-06,
64
- "loss": 0.3693,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.995850622406639,
69
- "grad_norm": 0.04834379140546879,
70
  "learning_rate": 5.8682408883346535e-06,
71
- "loss": 0.3722,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
- "eval_loss": 0.4088011085987091,
77
- "eval_runtime": 28.7977,
78
- "eval_samples_per_second": 19.064,
79
- "eval_steps_per_second": 4.792,
80
  "step": 41
81
  },
82
  {
83
  "epoch": 1.099585062240664,
84
- "grad_norm": 0.0437345911401945,
85
  "learning_rate": 4.781903063173321e-06,
86
- "loss": 0.3264,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2240663900414939,
91
- "grad_norm": 0.0435343363130576,
92
  "learning_rate": 3.705904774487396e-06,
93
- "loss": 0.3432,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3485477178423237,
98
- "grad_norm": 0.038958225706696346,
99
  "learning_rate": 2.6912569338248317e-06,
100
- "loss": 0.3608,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4730290456431536,
105
- "grad_norm": 0.043127252621560926,
106
  "learning_rate": 1.7860619515673034e-06,
107
- "loss": 0.3517,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5975103734439835,
112
- "grad_norm": 0.040329157413075245,
113
  "learning_rate": 1.0332332985438248e-06,
114
- "loss": 0.3471,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.7219917012448134,
119
- "grad_norm": 0.041243372965840595,
120
  "learning_rate": 4.6846106481675035e-07,
121
- "loss": 0.3397,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8464730290456433,
126
- "grad_norm": 0.03676264140179849,
127
  "learning_rate": 1.185199644003332e-07,
128
- "loss": 0.3206,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9709543568464731,
133
- "grad_norm": 0.0398501802157299,
134
  "learning_rate": 0.0,
135
- "loss": 0.3444,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.9709543568464731,
140
- "eval_loss": 0.40992262959480286,
141
- "eval_runtime": 28.1962,
142
- "eval_samples_per_second": 19.471,
143
- "eval_steps_per_second": 4.894,
144
  "step": 80
145
  }
146
  ],
@@ -161,7 +161,7 @@
161
  "attributes": {}
162
  }
163
  },
164
- "total_flos": 2.029407915034542e+17,
165
  "train_batch_size": 1,
166
  "trial_name": null,
167
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.024896265560165973,
13
+ "grad_norm": 0.11021283446475846,
14
  "learning_rate": 1.25e-06,
15
+ "loss": 0.4278,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.12448132780082988,
20
+ "grad_norm": 0.0774790715838221,
21
  "learning_rate": 6.25e-06,
22
+ "loss": 0.4499,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.24896265560165975,
27
+ "grad_norm": 0.08040067775763293,
28
  "learning_rate": 9.980973490458728e-06,
29
+ "loss": 0.4082,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.37344398340248963,
34
+ "grad_norm": 0.07045048232950048,
35
  "learning_rate": 9.768584753741134e-06,
36
+ "loss": 0.4303,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.4979253112033195,
41
+ "grad_norm": 0.05522617085677978,
42
  "learning_rate": 9.330127018922195e-06,
43
+ "loss": 0.3962,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6224066390041494,
48
+ "grad_norm": 0.04920532366631061,
49
  "learning_rate": 8.68638668405062e-06,
50
+ "loss": 0.3735,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7468879668049793,
55
+ "grad_norm": 0.052532219713063856,
56
  "learning_rate": 7.86788218175523e-06,
57
+ "loss": 0.4016,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8713692946058091,
62
+ "grad_norm": 0.05694643147496267,
63
  "learning_rate": 6.913417161825449e-06,
64
+ "loss": 0.3946,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.995850622406639,
69
+ "grad_norm": 0.04732918589868144,
70
  "learning_rate": 5.8682408883346535e-06,
71
+ "loss": 0.37,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "eval_loss": 0.40875306725502014,
77
+ "eval_runtime": 29.4951,
78
+ "eval_samples_per_second": 18.613,
79
+ "eval_steps_per_second": 4.679,
80
  "step": 41
81
  },
82
  {
83
  "epoch": 1.099585062240664,
84
+ "grad_norm": 0.042329664384282324,
85
  "learning_rate": 4.781903063173321e-06,
86
+ "loss": 0.3745,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2240663900414939,
91
+ "grad_norm": 0.042311381287778824,
92
  "learning_rate": 3.705904774487396e-06,
93
+ "loss": 0.3656,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3485477178423237,
98
+ "grad_norm": 0.04211186999995547,
99
  "learning_rate": 2.6912569338248317e-06,
100
+ "loss": 0.3249,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4730290456431536,
105
+ "grad_norm": 0.03859377186467179,
106
  "learning_rate": 1.7860619515673034e-06,
107
+ "loss": 0.3478,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5975103734439835,
112
+ "grad_norm": 0.038411951194646604,
113
  "learning_rate": 1.0332332985438248e-06,
114
+ "loss": 0.3441,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.7219917012448134,
119
+ "grad_norm": 0.036561598740802725,
120
  "learning_rate": 4.6846106481675035e-07,
121
+ "loss": 0.3186,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8464730290456433,
126
+ "grad_norm": 0.045523136033542085,
127
  "learning_rate": 1.185199644003332e-07,
128
+ "loss": 0.3124,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9709543568464731,
133
+ "grad_norm": 0.040621614883517274,
134
  "learning_rate": 0.0,
135
+ "loss": 0.3367,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.9709543568464731,
140
+ "eval_loss": 0.4118167757987976,
141
+ "eval_runtime": 28.4832,
142
+ "eval_samples_per_second": 19.275,
143
+ "eval_steps_per_second": 4.845,
144
  "step": 80
145
  }
146
  ],
 
161
  "attributes": {}
162
  }
163
  },
164
+ "total_flos": 2.0214124294176768e+17,
165
  "train_batch_size": 1,
166
  "trial_name": null,
167
  "trial_params": null
checkpoint-80/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa7e3d947f8f4edfc3f04cb89caba20dd5a6e20c1cb687a46d40fa5f37ff6853
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5091113cdf164b227f0dadd0fa30c76a926f363fc1d6b142a3bca31ae85cc04f
3
  size 7352
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e418375f069dc0a6e22dbe1f9fe2e9c1a35326401000926e39618012daef28
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c6e909727e20263eb587685655ef8609a2f4b6e653cb0678f747e2125b76fb7
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d20cd5d2b6d4fb439d5605863c4c5c974add43f60501ccb272f77d725ea63a9e
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba713efd72a7e5a23af5094c271658a5e9f240179cda56d6144a6e43ec878c2
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baf527bb1b8e21c120d29d08c1f65101e6dae78a7f16b082b592f156b54cde5d
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5925b2fee006cee5ccc1d208289db4b40b34930bb14ff7f78e0519a6d71dafb5
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2a64cb51e1a2804c06ce816c5c8aaf83f6f2b8cc3fbd1a4ad85549c3c263fe5
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba94e44e7c61f5667b9542de32dde71b1e571b4291a68a5b4baa5e905aea305
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.9709543568464731,
3
- "total_flos": 2.029407915034542e+17,
4
- "train_loss": 0.3740218333899975,
5
- "train_runtime": 1097.5227,
6
- "train_samples": 1927,
7
- "train_samples_per_second": 3.512,
8
- "train_steps_per_second": 0.073
9
  }
 
1
  {
2
  "epoch": 1.9709543568464731,
3
+ "total_flos": 2.0214124294176768e+17,
4
+ "train_loss": 0.371533726900816,
5
+ "train_runtime": 1201.5473,
6
+ "train_samples": 1925,
7
+ "train_samples_per_second": 3.204,
8
+ "train_steps_per_second": 0.067
9
  }
trainer_state.json CHANGED
@@ -10,147 +10,147 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.024896265560165973,
13
- "grad_norm": 0.09802406423206611,
14
  "learning_rate": 1.25e-06,
15
- "loss": 0.4475,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.12448132780082988,
20
- "grad_norm": 0.06629266158913129,
21
  "learning_rate": 6.25e-06,
22
- "loss": 0.4034,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.24896265560165975,
27
- "grad_norm": 0.07682971594488436,
28
  "learning_rate": 9.980973490458728e-06,
29
- "loss": 0.447,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.37344398340248963,
34
- "grad_norm": 0.06952920500301711,
35
  "learning_rate": 9.768584753741134e-06,
36
- "loss": 0.4512,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.4979253112033195,
41
- "grad_norm": 0.05665883883073202,
42
  "learning_rate": 9.330127018922195e-06,
43
- "loss": 0.4115,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6224066390041494,
48
- "grad_norm": 0.05543531120968404,
49
  "learning_rate": 8.68638668405062e-06,
50
- "loss": 0.4224,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7468879668049793,
55
- "grad_norm": 0.05055338657549615,
56
  "learning_rate": 7.86788218175523e-06,
57
- "loss": 0.3645,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8713692946058091,
62
- "grad_norm": 0.04640618215423049,
63
  "learning_rate": 6.913417161825449e-06,
64
- "loss": 0.3693,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.995850622406639,
69
- "grad_norm": 0.04834379140546879,
70
  "learning_rate": 5.8682408883346535e-06,
71
- "loss": 0.3722,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
- "eval_loss": 0.4088011085987091,
77
- "eval_runtime": 28.7977,
78
- "eval_samples_per_second": 19.064,
79
- "eval_steps_per_second": 4.792,
80
  "step": 41
81
  },
82
  {
83
  "epoch": 1.099585062240664,
84
- "grad_norm": 0.0437345911401945,
85
  "learning_rate": 4.781903063173321e-06,
86
- "loss": 0.3264,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2240663900414939,
91
- "grad_norm": 0.0435343363130576,
92
  "learning_rate": 3.705904774487396e-06,
93
- "loss": 0.3432,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3485477178423237,
98
- "grad_norm": 0.038958225706696346,
99
  "learning_rate": 2.6912569338248317e-06,
100
- "loss": 0.3608,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4730290456431536,
105
- "grad_norm": 0.043127252621560926,
106
  "learning_rate": 1.7860619515673034e-06,
107
- "loss": 0.3517,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5975103734439835,
112
- "grad_norm": 0.040329157413075245,
113
  "learning_rate": 1.0332332985438248e-06,
114
- "loss": 0.3471,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.7219917012448134,
119
- "grad_norm": 0.041243372965840595,
120
  "learning_rate": 4.6846106481675035e-07,
121
- "loss": 0.3397,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8464730290456433,
126
- "grad_norm": 0.03676264140179849,
127
  "learning_rate": 1.185199644003332e-07,
128
- "loss": 0.3206,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9709543568464731,
133
- "grad_norm": 0.0398501802157299,
134
  "learning_rate": 0.0,
135
- "loss": 0.3444,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.9709543568464731,
140
- "eval_loss": 0.40992262959480286,
141
- "eval_runtime": 28.1962,
142
- "eval_samples_per_second": 19.471,
143
- "eval_steps_per_second": 4.894,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 1.9709543568464731,
148
  "step": 80,
149
- "total_flos": 2.029407915034542e+17,
150
- "train_loss": 0.3740218333899975,
151
- "train_runtime": 1097.5227,
152
- "train_samples_per_second": 3.512,
153
- "train_steps_per_second": 0.073
154
  }
155
  ],
156
  "logging_steps": 5,
@@ -170,7 +170,7 @@
170
  "attributes": {}
171
  }
172
  },
173
- "total_flos": 2.029407915034542e+17,
174
  "train_batch_size": 1,
175
  "trial_name": null,
176
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.024896265560165973,
13
+ "grad_norm": 0.11021283446475846,
14
  "learning_rate": 1.25e-06,
15
+ "loss": 0.4278,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.12448132780082988,
20
+ "grad_norm": 0.0774790715838221,
21
  "learning_rate": 6.25e-06,
22
+ "loss": 0.4499,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.24896265560165975,
27
+ "grad_norm": 0.08040067775763293,
28
  "learning_rate": 9.980973490458728e-06,
29
+ "loss": 0.4082,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.37344398340248963,
34
+ "grad_norm": 0.07045048232950048,
35
  "learning_rate": 9.768584753741134e-06,
36
+ "loss": 0.4303,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.4979253112033195,
41
+ "grad_norm": 0.05522617085677978,
42
  "learning_rate": 9.330127018922195e-06,
43
+ "loss": 0.3962,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6224066390041494,
48
+ "grad_norm": 0.04920532366631061,
49
  "learning_rate": 8.68638668405062e-06,
50
+ "loss": 0.3735,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.7468879668049793,
55
+ "grad_norm": 0.052532219713063856,
56
  "learning_rate": 7.86788218175523e-06,
57
+ "loss": 0.4016,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8713692946058091,
62
+ "grad_norm": 0.05694643147496267,
63
  "learning_rate": 6.913417161825449e-06,
64
+ "loss": 0.3946,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.995850622406639,
69
+ "grad_norm": 0.04732918589868144,
70
  "learning_rate": 5.8682408883346535e-06,
71
+ "loss": 0.37,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "eval_loss": 0.40875306725502014,
77
+ "eval_runtime": 29.4951,
78
+ "eval_samples_per_second": 18.613,
79
+ "eval_steps_per_second": 4.679,
80
  "step": 41
81
  },
82
  {
83
  "epoch": 1.099585062240664,
84
+ "grad_norm": 0.042329664384282324,
85
  "learning_rate": 4.781903063173321e-06,
86
+ "loss": 0.3745,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.2240663900414939,
91
+ "grad_norm": 0.042311381287778824,
92
  "learning_rate": 3.705904774487396e-06,
93
+ "loss": 0.3656,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3485477178423237,
98
+ "grad_norm": 0.04211186999995547,
99
  "learning_rate": 2.6912569338248317e-06,
100
+ "loss": 0.3249,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4730290456431536,
105
+ "grad_norm": 0.03859377186467179,
106
  "learning_rate": 1.7860619515673034e-06,
107
+ "loss": 0.3478,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5975103734439835,
112
+ "grad_norm": 0.038411951194646604,
113
  "learning_rate": 1.0332332985438248e-06,
114
+ "loss": 0.3441,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.7219917012448134,
119
+ "grad_norm": 0.036561598740802725,
120
  "learning_rate": 4.6846106481675035e-07,
121
+ "loss": 0.3186,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8464730290456433,
126
+ "grad_norm": 0.045523136033542085,
127
  "learning_rate": 1.185199644003332e-07,
128
+ "loss": 0.3124,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9709543568464731,
133
+ "grad_norm": 0.040621614883517274,
134
  "learning_rate": 0.0,
135
+ "loss": 0.3367,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.9709543568464731,
140
+ "eval_loss": 0.4118167757987976,
141
+ "eval_runtime": 28.4832,
142
+ "eval_samples_per_second": 19.275,
143
+ "eval_steps_per_second": 4.845,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 1.9709543568464731,
148
  "step": 80,
149
+ "total_flos": 2.0214124294176768e+17,
150
+ "train_loss": 0.371533726900816,
151
+ "train_runtime": 1201.5473,
152
+ "train_samples_per_second": 3.204,
153
+ "train_steps_per_second": 0.067
154
  }
155
  ],
156
  "logging_steps": 5,
 
170
  "attributes": {}
171
  }
172
  },
173
+ "total_flos": 2.0214124294176768e+17,
174
  "train_batch_size": 1,
175
  "trial_name": null,
176
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa7e3d947f8f4edfc3f04cb89caba20dd5a6e20c1cb687a46d40fa5f37ff6853
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5091113cdf164b227f0dadd0fa30c76a926f363fc1d6b142a3bca31ae85cc04f
3
  size 7352