File size: 9,117 Bytes
c568859
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994107248084856,
  "eval_steps": 500,
  "global_step": 1272,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019642506383814574,
      "grad_norm": 2.737959623336792,
      "learning_rate": 3.90625e-05,
      "loss": 1.8852,
      "step": 25
    },
    {
      "epoch": 0.03928501276762915,
      "grad_norm": 0.8758559823036194,
      "learning_rate": 7.8125e-05,
      "loss": 1.4536,
      "step": 50
    },
    {
      "epoch": 0.05892751915144372,
      "grad_norm": 0.5361883640289307,
      "learning_rate": 0.00011718750000000001,
      "loss": 1.16,
      "step": 75
    },
    {
      "epoch": 0.0785700255352583,
      "grad_norm": 0.5839149951934814,
      "learning_rate": 0.00015625,
      "loss": 1.01,
      "step": 100
    },
    {
      "epoch": 0.09821253191907288,
      "grad_norm": 0.6101933121681213,
      "learning_rate": 0.0001953125,
      "loss": 0.9627,
      "step": 125
    },
    {
      "epoch": 0.11785503830288745,
      "grad_norm": 0.6417807936668396,
      "learning_rate": 0.00019615384615384615,
      "loss": 0.9149,
      "step": 150
    },
    {
      "epoch": 0.13749754468670203,
      "grad_norm": 0.549029529094696,
      "learning_rate": 0.0001917832167832168,
      "loss": 0.8981,
      "step": 175
    },
    {
      "epoch": 0.1571400510705166,
      "grad_norm": 0.6425251364707947,
      "learning_rate": 0.00018741258741258743,
      "loss": 0.8879,
      "step": 200
    },
    {
      "epoch": 0.17678255745433116,
      "grad_norm": 0.5528634786605835,
      "learning_rate": 0.00018304195804195805,
      "loss": 0.8691,
      "step": 225
    },
    {
      "epoch": 0.19642506383814576,
      "grad_norm": 0.6609376668930054,
      "learning_rate": 0.00017867132867132866,
      "loss": 0.8617,
      "step": 250
    },
    {
      "epoch": 0.21606757022196033,
      "grad_norm": 0.5458253622055054,
      "learning_rate": 0.0001743006993006993,
      "loss": 0.8572,
      "step": 275
    },
    {
      "epoch": 0.2357100766057749,
      "grad_norm": 0.6044121384620667,
      "learning_rate": 0.00016993006993006995,
      "loss": 0.8494,
      "step": 300
    },
    {
      "epoch": 0.25535258298958946,
      "grad_norm": 0.5752493739128113,
      "learning_rate": 0.00016555944055944056,
      "loss": 0.8381,
      "step": 325
    },
    {
      "epoch": 0.27499508937340406,
      "grad_norm": 0.5365332961082458,
      "learning_rate": 0.0001611888111888112,
      "loss": 0.8516,
      "step": 350
    },
    {
      "epoch": 0.2946375957572186,
      "grad_norm": 0.7016746997833252,
      "learning_rate": 0.00015681818181818182,
      "loss": 0.8359,
      "step": 375
    },
    {
      "epoch": 0.3142801021410332,
      "grad_norm": 0.6072686910629272,
      "learning_rate": 0.00015244755244755244,
      "loss": 0.8178,
      "step": 400
    },
    {
      "epoch": 0.3339226085248478,
      "grad_norm": 0.5570734739303589,
      "learning_rate": 0.00014807692307692308,
      "loss": 0.8127,
      "step": 425
    },
    {
      "epoch": 0.3535651149086623,
      "grad_norm": 0.5914424657821655,
      "learning_rate": 0.00014370629370629372,
      "loss": 0.8128,
      "step": 450
    },
    {
      "epoch": 0.3732076212924769,
      "grad_norm": 0.5375176072120667,
      "learning_rate": 0.00013933566433566434,
      "loss": 0.7828,
      "step": 475
    },
    {
      "epoch": 0.3928501276762915,
      "grad_norm": 0.5489270091056824,
      "learning_rate": 0.00013496503496503496,
      "loss": 0.8109,
      "step": 500
    },
    {
      "epoch": 0.41249263406010606,
      "grad_norm": 0.5411733984947205,
      "learning_rate": 0.0001305944055944056,
      "loss": 0.7862,
      "step": 525
    },
    {
      "epoch": 0.43213514044392065,
      "grad_norm": 0.5604883432388306,
      "learning_rate": 0.00012622377622377624,
      "loss": 0.8028,
      "step": 550
    },
    {
      "epoch": 0.45177764682773525,
      "grad_norm": 0.6268212199211121,
      "learning_rate": 0.00012185314685314686,
      "loss": 0.7969,
      "step": 575
    },
    {
      "epoch": 0.4714201532115498,
      "grad_norm": 0.5777909755706787,
      "learning_rate": 0.00011748251748251749,
      "loss": 0.7803,
      "step": 600
    },
    {
      "epoch": 0.4910626595953644,
      "grad_norm": 0.5517834424972534,
      "learning_rate": 0.0001131118881118881,
      "loss": 0.8052,
      "step": 625
    },
    {
      "epoch": 0.5107051659791789,
      "grad_norm": 0.5613248944282532,
      "learning_rate": 0.00010874125874125876,
      "loss": 0.7731,
      "step": 650
    },
    {
      "epoch": 0.5303476723629935,
      "grad_norm": 0.5555421113967896,
      "learning_rate": 0.00010437062937062938,
      "loss": 0.7959,
      "step": 675
    },
    {
      "epoch": 0.5499901787468081,
      "grad_norm": 0.5249913334846497,
      "learning_rate": 0.0001,
      "loss": 0.8082,
      "step": 700
    },
    {
      "epoch": 0.5696326851306227,
      "grad_norm": 0.578350841999054,
      "learning_rate": 9.562937062937063e-05,
      "loss": 0.7996,
      "step": 725
    },
    {
      "epoch": 0.5892751915144372,
      "grad_norm": 0.5972084403038025,
      "learning_rate": 9.125874125874126e-05,
      "loss": 0.7892,
      "step": 750
    },
    {
      "epoch": 0.6089176978982518,
      "grad_norm": 0.5550151467323303,
      "learning_rate": 8.688811188811189e-05,
      "loss": 0.7544,
      "step": 775
    },
    {
      "epoch": 0.6285602042820664,
      "grad_norm": 0.5595849752426147,
      "learning_rate": 8.251748251748252e-05,
      "loss": 0.7917,
      "step": 800
    },
    {
      "epoch": 0.648202710665881,
      "grad_norm": 0.5400447249412537,
      "learning_rate": 7.814685314685315e-05,
      "loss": 0.7429,
      "step": 825
    },
    {
      "epoch": 0.6678452170496956,
      "grad_norm": 0.5469474196434021,
      "learning_rate": 7.377622377622378e-05,
      "loss": 0.7858,
      "step": 850
    },
    {
      "epoch": 0.6874877234335102,
      "grad_norm": 0.5074354410171509,
      "learning_rate": 6.940559440559441e-05,
      "loss": 0.7378,
      "step": 875
    },
    {
      "epoch": 0.7071302298173247,
      "grad_norm": 0.5348958373069763,
      "learning_rate": 6.503496503496504e-05,
      "loss": 0.7742,
      "step": 900
    },
    {
      "epoch": 0.7267727362011392,
      "grad_norm": 0.5498335957527161,
      "learning_rate": 6.066433566433567e-05,
      "loss": 0.7922,
      "step": 925
    },
    {
      "epoch": 0.7464152425849538,
      "grad_norm": 0.5797409415245056,
      "learning_rate": 5.629370629370629e-05,
      "loss": 0.7567,
      "step": 950
    },
    {
      "epoch": 0.7660577489687684,
      "grad_norm": 0.5608484745025635,
      "learning_rate": 5.192307692307693e-05,
      "loss": 0.7533,
      "step": 975
    },
    {
      "epoch": 0.785700255352583,
      "grad_norm": 0.5730789303779602,
      "learning_rate": 4.755244755244756e-05,
      "loss": 0.7608,
      "step": 1000
    },
    {
      "epoch": 0.8053427617363975,
      "grad_norm": 0.5161120295524597,
      "learning_rate": 4.318181818181819e-05,
      "loss": 0.7644,
      "step": 1025
    },
    {
      "epoch": 0.8249852681202121,
      "grad_norm": 0.6298760175704956,
      "learning_rate": 3.8811188811188816e-05,
      "loss": 0.7678,
      "step": 1050
    },
    {
      "epoch": 0.8446277745040267,
      "grad_norm": 0.559695839881897,
      "learning_rate": 3.4440559440559445e-05,
      "loss": 0.7627,
      "step": 1075
    },
    {
      "epoch": 0.8642702808878413,
      "grad_norm": 0.5945947170257568,
      "learning_rate": 3.0069930069930068e-05,
      "loss": 0.7767,
      "step": 1100
    },
    {
      "epoch": 0.8839127872716559,
      "grad_norm": 0.5842404365539551,
      "learning_rate": 2.5699300699300697e-05,
      "loss": 0.7752,
      "step": 1125
    },
    {
      "epoch": 0.9035552936554705,
      "grad_norm": 0.5409468412399292,
      "learning_rate": 2.132867132867133e-05,
      "loss": 0.7667,
      "step": 1150
    },
    {
      "epoch": 0.923197800039285,
      "grad_norm": 0.6497332453727722,
      "learning_rate": 1.695804195804196e-05,
      "loss": 0.7817,
      "step": 1175
    },
    {
      "epoch": 0.9428403064230996,
      "grad_norm": 0.5824007987976074,
      "learning_rate": 1.2587412587412589e-05,
      "loss": 0.7951,
      "step": 1200
    },
    {
      "epoch": 0.9624828128069142,
      "grad_norm": 0.6233786940574646,
      "learning_rate": 8.216783216783217e-06,
      "loss": 0.7926,
      "step": 1225
    },
    {
      "epoch": 0.9821253191907288,
      "grad_norm": 0.5785284042358398,
      "learning_rate": 3.846153846153847e-06,
      "loss": 0.7632,
      "step": 1250
    }
  ],
  "logging_steps": 25,
  "max_steps": 1272,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 1.2402531158196224e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}