LamaDiab commited on
Commit
abcffeb
·
verified ·
1 Parent(s): 5ce6c71

Training checkpoint - Epoch 1, Step 8592

Browse files
checkpoint-8592/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89a61c487beb363b9f2ba06b81ddc3e53d6b0a2c639ee23e6952bd7780510834
3
  size 90864192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d1a4111577ef4d291e80e5292f0f3c219b62e62ef9abecf13ac1d65f8f5dc4f
3
  size 90864192
checkpoint-8592/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ed2077cd1424a4bd6263b74c29ad763d99139dddac5cda268504b7e6a29c62a
3
  size 180607738
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a100d193183d7e7b094edbc47b60e62ce79dd9f643b10003c192501c3c5c5b4
3
  size 180607738
checkpoint-8592/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed8957d8449ae46b38d8fe19fffa1ae3f7b3df2da18808afae432603b2da24ba
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc2590fb0ec9cda487c083dd0ef28aca961dbe4136bc2c4ee0e715bcfbe0d3e7
3
  size 14244
checkpoint-8592/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8af7a0439d0c5b73969913ee5bf0dce610cfc36e12df7e585d423d497b4b4781
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b53a8e07ee4daabf485fd62b89cf48e6127315454b1d158f92cddba68d6165d
3
  size 988
checkpoint-8592/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a793324604a443d51136ef867b5388749921f4ed6b25fd241c495cebbb15b87c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a22a0371420d37f26bb457170020470f80c985620cede98e6725eac95d9b56d3
3
  size 1064
checkpoint-8592/trainer_state.json CHANGED
@@ -11,281 +11,281 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.00011638733705772812,
14
- "grad_norm": Infinity,
15
  "learning_rate": 0.0,
16
- "loss": 9.4531,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.05819366852886406,
21
- "grad_norm": 6.4754815101623535,
22
- "learning_rate": 1.7390292166220463e-06,
23
- "loss": 6.7666,
24
  "step": 500
25
  },
26
  {
27
  "epoch": 0.05819366852886406,
28
- "eval_cosine_accuracy": 0.9417916536331177,
29
- "eval_loss": 4.736684322357178,
30
- "eval_runtime": 26.7399,
31
- "eval_samples_per_second": 354.003,
32
- "eval_steps_per_second": 1.384,
33
  "step": 500
34
  },
35
  {
36
  "epoch": 0.11638733705772812,
37
- "grad_norm": 5.042841911315918,
38
- "learning_rate": 3.4850424863228964e-06,
39
- "loss": 4.0612,
40
  "step": 1000
41
  },
42
  {
43
  "epoch": 0.11638733705772812,
44
- "eval_cosine_accuracy": 0.9421085715293884,
45
- "eval_loss": 4.456507682800293,
46
- "eval_runtime": 27.0721,
47
- "eval_samples_per_second": 349.659,
48
- "eval_steps_per_second": 1.367,
49
  "step": 1000
50
  },
51
  {
52
  "epoch": 0.17458100558659218,
53
- "grad_norm": 4.396075248718262,
54
- "learning_rate": 5.2310557560237455e-06,
55
- "loss": 3.8306,
56
  "step": 1500
57
  },
58
  {
59
  "epoch": 0.17458100558659218,
60
- "eval_cosine_accuracy": 0.9389393329620361,
61
- "eval_loss": 4.417118072509766,
62
- "eval_runtime": 29.5299,
63
- "eval_samples_per_second": 320.556,
64
- "eval_steps_per_second": 1.253,
65
  "step": 1500
66
  },
67
  {
68
  "epoch": 0.23277467411545624,
69
- "grad_norm": 4.9983229637146,
70
- "learning_rate": 6.9770690257245955e-06,
71
- "loss": 3.7166,
72
  "step": 2000
73
  },
74
  {
75
  "epoch": 0.23277467411545624,
76
- "eval_cosine_accuracy": 0.9376716613769531,
77
- "eval_loss": 4.384617805480957,
78
- "eval_runtime": 27.1288,
79
- "eval_samples_per_second": 348.928,
80
- "eval_steps_per_second": 1.364,
81
  "step": 2000
82
  },
83
  {
84
  "epoch": 0.2909683426443203,
85
- "grad_norm": 5.1397552490234375,
86
- "learning_rate": 8.723082295425445e-06,
87
- "loss": 3.5906,
88
  "step": 2500
89
  },
90
  {
91
  "epoch": 0.2909683426443203,
92
- "eval_cosine_accuracy": 0.929537296295166,
93
- "eval_loss": 4.382342338562012,
94
- "eval_runtime": 27.2377,
95
- "eval_samples_per_second": 347.534,
96
- "eval_steps_per_second": 1.358,
97
  "step": 2500
98
  },
99
  {
100
  "epoch": 0.34916201117318435,
101
- "grad_norm": 5.401367664337158,
102
- "learning_rate": 1.0469095565126295e-05,
103
- "loss": 3.504,
104
  "step": 3000
105
  },
106
  {
107
  "epoch": 0.34916201117318435,
108
- "eval_cosine_accuracy": 0.9236213564872742,
109
- "eval_loss": 4.398530006408691,
110
- "eval_runtime": 27.2096,
111
- "eval_samples_per_second": 347.891,
112
- "eval_steps_per_second": 1.36,
113
  "step": 3000
114
  },
115
  {
116
  "epoch": 0.4073556797020484,
117
- "grad_norm": 5.506592273712158,
118
- "learning_rate": 1.2215108834827144e-05,
119
- "loss": 3.4064,
120
  "step": 3500
121
  },
122
  {
123
  "epoch": 0.4073556797020484,
124
- "eval_cosine_accuracy": 0.9207690954208374,
125
- "eval_loss": 4.428633689880371,
126
- "eval_runtime": 27.2549,
127
- "eval_samples_per_second": 347.314,
128
- "eval_steps_per_second": 1.358,
129
  "step": 3500
130
  },
131
  {
132
  "epoch": 0.4655493482309125,
133
- "grad_norm": 6.144413471221924,
134
- "learning_rate": 1.3961122104527995e-05,
135
- "loss": 3.3219,
136
  "step": 4000
137
  },
138
  {
139
  "epoch": 0.4655493482309125,
140
- "eval_cosine_accuracy": 0.9240439534187317,
141
- "eval_loss": 4.413504600524902,
142
- "eval_runtime": 27.1733,
143
- "eval_samples_per_second": 348.357,
144
- "eval_steps_per_second": 1.362,
145
  "step": 4000
146
  },
147
  {
148
  "epoch": 0.5237430167597765,
149
- "grad_norm": 6.7739691734313965,
150
- "learning_rate": 1.570364334768944e-05,
151
- "loss": 3.2308,
152
  "step": 4500
153
  },
154
  {
155
  "epoch": 0.5237430167597765,
156
- "eval_cosine_accuracy": 0.9183393120765686,
157
- "eval_loss": 4.584815979003906,
158
- "eval_runtime": 27.4916,
159
- "eval_samples_per_second": 344.324,
160
- "eval_steps_per_second": 1.346,
161
  "step": 4500
162
  },
163
  {
164
  "epoch": 0.5819366852886406,
165
- "grad_norm": 7.5280914306640625,
166
- "learning_rate": 1.7449656617390294e-05,
167
- "loss": 3.1167,
168
  "step": 5000
169
  },
170
  {
171
  "epoch": 0.5819366852886406,
172
- "eval_cosine_accuracy": 0.9231988191604614,
173
- "eval_loss": 4.731673240661621,
174
- "eval_runtime": 27.9733,
175
- "eval_samples_per_second": 338.394,
176
- "eval_steps_per_second": 1.323,
177
  "step": 5000
178
  },
179
  {
180
  "epoch": 0.6401303538175046,
181
- "grad_norm": 8.379385948181152,
182
- "learning_rate": 1.919217786055174e-05,
183
- "loss": 3.0155,
184
  "step": 5500
185
  },
186
  {
187
  "epoch": 0.6401303538175046,
188
- "eval_cosine_accuracy": 0.9202408790588379,
189
- "eval_loss": 4.590743064880371,
190
- "eval_runtime": 30.4387,
191
- "eval_samples_per_second": 310.985,
192
- "eval_steps_per_second": 1.216,
193
  "step": 5500
194
  },
195
  {
196
  "epoch": 0.6983240223463687,
197
- "grad_norm": 7.9107985496521,
198
- "learning_rate": 2.093819113025259e-05,
199
- "loss": 2.928,
200
  "step": 6000
201
  },
202
  {
203
  "epoch": 0.6983240223463687,
204
- "eval_cosine_accuracy": 0.9188675284385681,
205
- "eval_loss": 4.662985324859619,
206
- "eval_runtime": 27.6404,
207
- "eval_samples_per_second": 342.469,
208
- "eval_steps_per_second": 1.339,
209
  "step": 6000
210
  },
211
  {
212
  "epoch": 0.7565176908752328,
213
- "grad_norm": 8.108757972717285,
214
- "learning_rate": 2.268420439995344e-05,
215
- "loss": 2.8455,
216
  "step": 6500
217
  },
218
  {
219
  "epoch": 0.7565176908752328,
220
- "eval_cosine_accuracy": 0.9199239611625671,
221
- "eval_loss": 4.784646511077881,
222
- "eval_runtime": 27.3827,
223
- "eval_samples_per_second": 345.693,
224
- "eval_steps_per_second": 1.351,
225
  "step": 6500
226
  },
227
  {
228
  "epoch": 0.8147113594040968,
229
- "grad_norm": 8.092316627502441,
230
- "learning_rate": 2.443021766965429e-05,
231
- "loss": 2.7847,
232
  "step": 7000
233
  },
234
  {
235
  "epoch": 0.8147113594040968,
236
- "eval_cosine_accuracy": 0.9054510593414307,
237
- "eval_loss": 4.925645351409912,
238
- "eval_runtime": 27.6051,
239
- "eval_samples_per_second": 342.908,
240
- "eval_steps_per_second": 1.34,
241
  "step": 7000
242
  },
243
  {
244
  "epoch": 0.8729050279329609,
245
- "grad_norm": 8.676020622253418,
246
- "learning_rate": 2.617623093935514e-05,
247
- "loss": 2.7228,
248
  "step": 7500
249
  },
250
  {
251
  "epoch": 0.8729050279329609,
252
- "eval_cosine_accuracy": 0.9056623578071594,
253
- "eval_loss": 4.843690395355225,
254
- "eval_runtime": 27.3545,
255
- "eval_samples_per_second": 346.049,
256
- "eval_steps_per_second": 1.353,
257
  "step": 7500
258
  },
259
  {
260
  "epoch": 0.931098696461825,
261
- "grad_norm": 9.083893775939941,
262
- "learning_rate": 2.791875218251659e-05,
263
- "loss": 2.6738,
264
  "step": 8000
265
  },
266
  {
267
  "epoch": 0.931098696461825,
268
- "eval_cosine_accuracy": 0.907247006893158,
269
- "eval_loss": 4.833749771118164,
270
- "eval_runtime": 27.6952,
271
- "eval_samples_per_second": 341.791,
272
- "eval_steps_per_second": 1.336,
273
  "step": 8000
274
  },
275
  {
276
  "epoch": 0.9892923649906891,
277
- "grad_norm": 7.430295944213867,
278
- "learning_rate": 2.9664765452217438e-05,
279
- "loss": 2.6174,
280
  "step": 8500
281
  },
282
  {
283
  "epoch": 0.9892923649906891,
284
- "eval_cosine_accuracy": 0.9006972312927246,
285
- "eval_loss": 4.809013366699219,
286
- "eval_runtime": 27.7002,
287
- "eval_samples_per_second": 341.731,
288
- "eval_steps_per_second": 1.336,
289
  "step": 8500
290
  }
291
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.00011638733705772812,
14
+ "grad_norm": 8.116299629211426,
15
  "learning_rate": 0.0,
16
+ "loss": 2.7266,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.05819366852886406,
21
+ "grad_norm": 57.76313781738281,
22
+ "learning_rate": 1.732045163543243e-06,
23
+ "loss": 3.7373,
24
  "step": 500
25
  },
26
  {
27
  "epoch": 0.05819366852886406,
28
+ "eval_cosine_accuracy": 0.9404736757278442,
29
+ "eval_loss": 2.536726713180542,
30
+ "eval_runtime": 26.2292,
31
+ "eval_samples_per_second": 360.59,
32
+ "eval_steps_per_second": 1.411,
33
  "step": 500
34
  },
35
  {
36
  "epoch": 0.11638733705772812,
37
+ "grad_norm": 5.569820880889893,
38
+ "learning_rate": 3.4780584332440925e-06,
39
+ "loss": 2.8655,
40
  "step": 1000
41
  },
42
  {
43
  "epoch": 0.11638733705772812,
44
+ "eval_cosine_accuracy": 0.9364559054374695,
45
+ "eval_loss": 2.6046745777130127,
46
+ "eval_runtime": 26.7465,
47
+ "eval_samples_per_second": 353.617,
48
+ "eval_steps_per_second": 1.383,
49
  "step": 1000
50
  },
51
  {
52
  "epoch": 0.17458100558659218,
53
+ "grad_norm": 5.892590045928955,
54
+ "learning_rate": 5.224071702944943e-06,
55
+ "loss": 2.3859,
56
  "step": 1500
57
  },
58
  {
59
  "epoch": 0.17458100558659218,
60
+ "eval_cosine_accuracy": 0.9412137866020203,
61
+ "eval_loss": 2.540637969970703,
62
+ "eval_runtime": 26.8122,
63
+ "eval_samples_per_second": 352.75,
64
+ "eval_steps_per_second": 1.38,
65
  "step": 1500
66
  },
67
  {
68
  "epoch": 0.23277467411545624,
69
+ "grad_norm": 41.33493423461914,
70
+ "learning_rate": 6.970084972645793e-06,
71
+ "loss": 2.1884,
72
  "step": 2000
73
  },
74
  {
75
  "epoch": 0.23277467411545624,
76
+ "eval_cosine_accuracy": 0.9451258182525635,
77
+ "eval_loss": 2.5318424701690674,
78
+ "eval_runtime": 26.0188,
79
+ "eval_samples_per_second": 363.506,
80
+ "eval_steps_per_second": 1.422,
81
  "step": 2000
82
  },
83
  {
84
  "epoch": 0.2909683426443203,
85
+ "grad_norm": 15.76939582824707,
86
+ "learning_rate": 8.71260621580724e-06,
87
+ "loss": 1.9576,
88
  "step": 2500
89
  },
90
  {
91
  "epoch": 0.2909683426443203,
92
+ "eval_cosine_accuracy": 0.9467117786407471,
93
+ "eval_loss": 2.474266767501831,
94
+ "eval_runtime": 26.2939,
95
+ "eval_samples_per_second": 359.703,
96
+ "eval_steps_per_second": 1.407,
97
  "step": 2500
98
  },
99
  {
100
  "epoch": 0.34916201117318435,
101
+ "grad_norm": 7.4116950035095215,
102
+ "learning_rate": 1.045861948550809e-05,
103
+ "loss": 1.8211,
104
  "step": 3000
105
  },
106
  {
107
  "epoch": 0.34916201117318435,
108
+ "eval_cosine_accuracy": 0.9435398578643799,
109
+ "eval_loss": 2.547072649002075,
110
+ "eval_runtime": 26.1008,
111
+ "eval_samples_per_second": 362.365,
112
+ "eval_steps_per_second": 1.418,
113
  "step": 3000
114
  },
115
  {
116
  "epoch": 0.4073556797020484,
117
+ "grad_norm": 25.99747657775879,
118
+ "learning_rate": 1.2204632755208939e-05,
119
+ "loss": 1.6603,
120
  "step": 3500
121
  },
122
  {
123
  "epoch": 0.4073556797020484,
124
+ "eval_cosine_accuracy": 0.9489321112632751,
125
+ "eval_loss": 2.472174644470215,
126
+ "eval_runtime": 27.1444,
127
+ "eval_samples_per_second": 348.433,
128
+ "eval_steps_per_second": 1.363,
129
  "step": 3500
130
  },
131
  {
132
  "epoch": 0.4655493482309125,
133
+ "grad_norm": 5.763104438781738,
134
+ "learning_rate": 1.395064602490979e-05,
135
+ "loss": 1.596,
136
  "step": 4000
137
  },
138
  {
139
  "epoch": 0.4655493482309125,
140
+ "eval_cosine_accuracy": 0.9438570737838745,
141
+ "eval_loss": 2.5426251888275146,
142
+ "eval_runtime": 28.2718,
143
+ "eval_samples_per_second": 334.538,
144
+ "eval_steps_per_second": 1.309,
145
  "step": 4000
146
  },
147
  {
148
  "epoch": 0.5237430167597765,
149
+ "grad_norm": 4.893315315246582,
150
+ "learning_rate": 1.5693167268071237e-05,
151
+ "loss": 1.5379,
152
  "step": 4500
153
  },
154
  {
155
  "epoch": 0.5237430167597765,
156
+ "eval_cosine_accuracy": 0.9473461508750916,
157
+ "eval_loss": 2.4768149852752686,
158
+ "eval_runtime": 28.2378,
159
+ "eval_samples_per_second": 334.941,
160
+ "eval_steps_per_second": 1.31,
161
  "step": 4500
162
  },
163
  {
164
  "epoch": 0.5819366852886406,
165
+ "grad_norm": 43.368614196777344,
166
+ "learning_rate": 1.7439180537772086e-05,
167
+ "loss": 1.5397,
168
  "step": 5000
169
  },
170
  {
171
  "epoch": 0.5819366852886406,
172
+ "eval_cosine_accuracy": 0.9487206339836121,
173
+ "eval_loss": 2.4771170616149902,
174
+ "eval_runtime": 28.4116,
175
+ "eval_samples_per_second": 332.892,
176
+ "eval_steps_per_second": 1.302,
177
  "step": 5000
178
  },
179
  {
180
  "epoch": 0.6401303538175046,
181
+ "grad_norm": 0.21546457707881927,
182
+ "learning_rate": 1.9185193807472936e-05,
183
+ "loss": 1.381,
184
  "step": 5500
185
  },
186
  {
187
  "epoch": 0.6401303538175046,
188
+ "eval_cosine_accuracy": 0.9412137866020203,
189
+ "eval_loss": 2.6126925945281982,
190
+ "eval_runtime": 28.5708,
191
+ "eval_samples_per_second": 331.037,
192
+ "eval_steps_per_second": 1.295,
193
  "step": 5500
194
  },
195
  {
196
  "epoch": 0.6983240223463687,
197
+ "grad_norm": 27.70214080810547,
198
+ "learning_rate": 2.0931207077173788e-05,
199
+ "loss": 1.4407,
200
  "step": 6000
201
  },
202
  {
203
  "epoch": 0.6983240223463687,
204
+ "eval_cosine_accuracy": 0.9492493271827698,
205
+ "eval_loss": 2.457711935043335,
206
+ "eval_runtime": 28.3495,
207
+ "eval_samples_per_second": 333.621,
208
+ "eval_steps_per_second": 1.305,
209
  "step": 6000
210
  },
211
  {
212
  "epoch": 0.7565176908752328,
213
+ "grad_norm": 0.6600456237792969,
214
+ "learning_rate": 2.2677220346874637e-05,
215
+ "loss": 1.3692,
216
  "step": 6500
217
  },
218
  {
219
  "epoch": 0.7565176908752328,
220
+ "eval_cosine_accuracy": 0.9438570737838745,
221
+ "eval_loss": 2.48168683052063,
222
+ "eval_runtime": 28.3758,
223
+ "eval_samples_per_second": 333.313,
224
+ "eval_steps_per_second": 1.304,
225
  "step": 6500
226
  },
227
  {
228
  "epoch": 0.8147113594040968,
229
+ "grad_norm": 15.486236572265625,
230
+ "learning_rate": 2.4419741590036085e-05,
231
+ "loss": 1.2731,
232
  "step": 7000
233
  },
234
  {
235
  "epoch": 0.8147113594040968,
236
+ "eval_cosine_accuracy": 0.943751335144043,
237
+ "eval_loss": 2.5139832496643066,
238
+ "eval_runtime": 28.0964,
239
+ "eval_samples_per_second": 336.627,
240
+ "eval_steps_per_second": 1.317,
241
  "step": 7000
242
  },
243
  {
244
  "epoch": 0.8729050279329609,
245
+ "grad_norm": 13.570350646972656,
246
+ "learning_rate": 2.6165754859736934e-05,
247
+ "loss": 1.223,
248
  "step": 7500
249
  },
250
  {
251
  "epoch": 0.8729050279329609,
252
+ "eval_cosine_accuracy": 0.9494607448577881,
253
+ "eval_loss": 2.4431588649749756,
254
+ "eval_runtime": 28.2933,
255
+ "eval_samples_per_second": 334.285,
256
+ "eval_steps_per_second": 1.308,
257
  "step": 7500
258
  },
259
  {
260
  "epoch": 0.931098696461825,
261
+ "grad_norm": 4.8542633056640625,
262
+ "learning_rate": 2.7911768129437783e-05,
263
+ "loss": 1.1982,
264
  "step": 8000
265
  },
266
  {
267
  "epoch": 0.931098696461825,
268
+ "eval_cosine_accuracy": 0.9420596361160278,
269
+ "eval_loss": 2.5187907218933105,
270
+ "eval_runtime": 28.0669,
271
+ "eval_samples_per_second": 336.98,
272
+ "eval_steps_per_second": 1.318,
273
  "step": 8000
274
  },
275
  {
276
  "epoch": 0.9892923649906891,
277
+ "grad_norm": 36.649906158447266,
278
+ "learning_rate": 2.9657781399138632e-05,
279
+ "loss": 1.1693,
280
  "step": 8500
281
  },
282
  {
283
  "epoch": 0.9892923649906891,
284
+ "eval_cosine_accuracy": 0.9469232559204102,
285
+ "eval_loss": 2.4668424129486084,
286
+ "eval_runtime": 28.3845,
287
+ "eval_samples_per_second": 333.21,
288
+ "eval_steps_per_second": 1.304,
289
  "step": 8500
290
  }
291
  ],
checkpoint-8592/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2efd1e792aece673d4f9c02b27251769a72b304f7949f137f2a4d2582ff9c2ea
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb5dbf286f9bcff890fd932c91238ff7e047b1e1b1921ecabbd4cf0142ff6086
3
  size 5688
checkpoint-8592/training_metrics.json CHANGED
@@ -3,104 +3,104 @@
3
  {
4
  "epoch": 0.05819366852886406,
5
  "global_step": 500,
6
- "eval_loss": 4.736684322357178,
7
- "eval_cosine_accuracy": 0.9417916536331177
8
  },
9
  {
10
  "epoch": 0.11638733705772812,
11
  "global_step": 1000,
12
- "eval_loss": 4.456507682800293,
13
- "eval_cosine_accuracy": 0.9421085715293884
14
  },
15
  {
16
  "epoch": 0.17458100558659218,
17
  "global_step": 1500,
18
- "eval_loss": 4.417118072509766,
19
- "eval_cosine_accuracy": 0.9389393329620361
20
  },
21
  {
22
  "epoch": 0.23277467411545624,
23
  "global_step": 2000,
24
- "eval_loss": 4.384617805480957,
25
- "eval_cosine_accuracy": 0.9376716613769531
26
  },
27
  {
28
  "epoch": 0.2909683426443203,
29
  "global_step": 2500,
30
- "eval_loss": 4.382342338562012,
31
- "eval_cosine_accuracy": 0.929537296295166
32
  },
33
  {
34
  "epoch": 0.34916201117318435,
35
  "global_step": 3000,
36
- "eval_loss": 4.398530006408691,
37
- "eval_cosine_accuracy": 0.9236213564872742
38
  },
39
  {
40
  "epoch": 0.4073556797020484,
41
  "global_step": 3500,
42
- "eval_loss": 4.428633689880371,
43
- "eval_cosine_accuracy": 0.9207690954208374
44
  },
45
  {
46
  "epoch": 0.4655493482309125,
47
  "global_step": 4000,
48
- "eval_loss": 4.413504600524902,
49
- "eval_cosine_accuracy": 0.9240439534187317
50
  },
51
  {
52
  "epoch": 0.5237430167597765,
53
  "global_step": 4500,
54
- "eval_loss": 4.584815979003906,
55
- "eval_cosine_accuracy": 0.9183393120765686
56
  },
57
  {
58
  "epoch": 0.5819366852886406,
59
  "global_step": 5000,
60
- "eval_loss": 4.731673240661621,
61
- "eval_cosine_accuracy": 0.9231988191604614
62
  },
63
  {
64
  "epoch": 0.6401303538175046,
65
  "global_step": 5500,
66
- "eval_loss": 4.590743064880371,
67
- "eval_cosine_accuracy": 0.9202408790588379
68
  },
69
  {
70
  "epoch": 0.6983240223463687,
71
  "global_step": 6000,
72
- "eval_loss": 4.662985324859619,
73
- "eval_cosine_accuracy": 0.9188675284385681
74
  },
75
  {
76
  "epoch": 0.7565176908752328,
77
  "global_step": 6500,
78
- "eval_loss": 4.784646511077881,
79
- "eval_cosine_accuracy": 0.9199239611625671
80
  },
81
  {
82
  "epoch": 0.8147113594040968,
83
  "global_step": 7000,
84
- "eval_loss": 4.925645351409912,
85
- "eval_cosine_accuracy": 0.9054510593414307
86
  },
87
  {
88
  "epoch": 0.8729050279329609,
89
  "global_step": 7500,
90
- "eval_loss": 4.843690395355225,
91
- "eval_cosine_accuracy": 0.9056623578071594
92
  },
93
  {
94
  "epoch": 0.931098696461825,
95
  "global_step": 8000,
96
- "eval_loss": 4.833749771118164,
97
- "eval_cosine_accuracy": 0.907247006893158
98
  },
99
  {
100
  "epoch": 0.9892923649906891,
101
  "global_step": 8500,
102
- "eval_loss": 4.809013366699219,
103
- "eval_cosine_accuracy": 0.9006972312927246
104
  }
105
  ],
106
  "current_epoch": 1,
 
3
  {
4
  "epoch": 0.05819366852886406,
5
  "global_step": 500,
6
+ "eval_loss": 2.536726713180542,
7
+ "eval_cosine_accuracy": 0.9404736757278442
8
  },
9
  {
10
  "epoch": 0.11638733705772812,
11
  "global_step": 1000,
12
+ "eval_loss": 2.6046745777130127,
13
+ "eval_cosine_accuracy": 0.9364559054374695
14
  },
15
  {
16
  "epoch": 0.17458100558659218,
17
  "global_step": 1500,
18
+ "eval_loss": 2.540637969970703,
19
+ "eval_cosine_accuracy": 0.9412137866020203
20
  },
21
  {
22
  "epoch": 0.23277467411545624,
23
  "global_step": 2000,
24
+ "eval_loss": 2.5318424701690674,
25
+ "eval_cosine_accuracy": 0.9451258182525635
26
  },
27
  {
28
  "epoch": 0.2909683426443203,
29
  "global_step": 2500,
30
+ "eval_loss": 2.474266767501831,
31
+ "eval_cosine_accuracy": 0.9467117786407471
32
  },
33
  {
34
  "epoch": 0.34916201117318435,
35
  "global_step": 3000,
36
+ "eval_loss": 2.547072649002075,
37
+ "eval_cosine_accuracy": 0.9435398578643799
38
  },
39
  {
40
  "epoch": 0.4073556797020484,
41
  "global_step": 3500,
42
+ "eval_loss": 2.472174644470215,
43
+ "eval_cosine_accuracy": 0.9489321112632751
44
  },
45
  {
46
  "epoch": 0.4655493482309125,
47
  "global_step": 4000,
48
+ "eval_loss": 2.5426251888275146,
49
+ "eval_cosine_accuracy": 0.9438570737838745
50
  },
51
  {
52
  "epoch": 0.5237430167597765,
53
  "global_step": 4500,
54
+ "eval_loss": 2.4768149852752686,
55
+ "eval_cosine_accuracy": 0.9473461508750916
56
  },
57
  {
58
  "epoch": 0.5819366852886406,
59
  "global_step": 5000,
60
+ "eval_loss": 2.4771170616149902,
61
+ "eval_cosine_accuracy": 0.9487206339836121
62
  },
63
  {
64
  "epoch": 0.6401303538175046,
65
  "global_step": 5500,
66
+ "eval_loss": 2.6126925945281982,
67
+ "eval_cosine_accuracy": 0.9412137866020203
68
  },
69
  {
70
  "epoch": 0.6983240223463687,
71
  "global_step": 6000,
72
+ "eval_loss": 2.457711935043335,
73
+ "eval_cosine_accuracy": 0.9492493271827698
74
  },
75
  {
76
  "epoch": 0.7565176908752328,
77
  "global_step": 6500,
78
+ "eval_loss": 2.48168683052063,
79
+ "eval_cosine_accuracy": 0.9438570737838745
80
  },
81
  {
82
  "epoch": 0.8147113594040968,
83
  "global_step": 7000,
84
+ "eval_loss": 2.5139832496643066,
85
+ "eval_cosine_accuracy": 0.943751335144043
86
  },
87
  {
88
  "epoch": 0.8729050279329609,
89
  "global_step": 7500,
90
+ "eval_loss": 2.4431588649749756,
91
+ "eval_cosine_accuracy": 0.9494607448577881
92
  },
93
  {
94
  "epoch": 0.931098696461825,
95
  "global_step": 8000,
96
+ "eval_loss": 2.5187907218933105,
97
+ "eval_cosine_accuracy": 0.9420596361160278
98
  },
99
  {
100
  "epoch": 0.9892923649906891,
101
  "global_step": 8500,
102
+ "eval_loss": 2.4668424129486084,
103
+ "eval_cosine_accuracy": 0.9469232559204102
104
  }
105
  ],
106
  "current_epoch": 1,