quinnlue commited on
Commit
c9c0a46
·
verified ·
1 Parent(s): 0d66abf

Upload training_log_50pct_step6624.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_log_50pct_step6624.json +883 -0
training_log_50pct_step6624.json ADDED
@@ -0,0 +1,883 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "epochs": 1,
4
+ "batch_size": 16,
5
+ "gradient_accumulation_steps": 4,
6
+ "effective_batch_size": 64,
7
+ "cross_max_length": 64,
8
+ "lora_r": 8,
9
+ "lora_alpha": 16,
10
+ "target_modules": [
11
+ "query_key_value"
12
+ ],
13
+ "total_steps": 13249,
14
+ "start_time": "2026-01-11T03:45:55.374803"
15
+ },
16
+ "steps": [
17
+ {
18
+ "step": 100,
19
+ "epoch": 0,
20
+ "loss": 3.0553906083106996,
21
+ "perplexity": 21.229476317462176,
22
+ "grad_norm": 0.9999998862745418,
23
+ "lr": 1.8882175226586103e-06,
24
+ "cross_attn_scale": 0.0755287009063444,
25
+ "tokens_per_sec": 29659.667067018858,
26
+ "step_time_ms": 72.3056697845459,
27
+ "elapsed_sec": 11.671439170837402,
28
+ "progress_pct": 0.7547739452034116
29
+ },
30
+ {
31
+ "step": 200,
32
+ "epoch": 0,
33
+ "loss": 3.0449283146858215,
34
+ "perplexity": 21.00852514617355,
35
+ "grad_norm": 0.9999998771969448,
36
+ "lr": 3.7764350453172205e-06,
37
+ "cross_attn_scale": 0.1510574018126888,
38
+ "tokens_per_sec": 29998.955284666587,
39
+ "step_time_ms": 68.73083829879761,
40
+ "elapsed_sec": 22.878730058670044,
41
+ "progress_pct": 1.5095478904068231
42
+ },
43
+ {
44
+ "step": 300,
45
+ "epoch": 0,
46
+ "loss": 3.047647387981415,
47
+ "perplexity": 21.065726598105954,
48
+ "grad_norm": 0.9999998694186822,
49
+ "lr": 5.664652567975831e-06,
50
+ "cross_attn_scale": 0.22658610271903323,
51
+ "tokens_per_sec": 30055.703757973926,
52
+ "step_time_ms": 68.56269359588623,
53
+ "elapsed_sec": 33.98429822921753,
54
+ "progress_pct": 2.2643218356102346
55
+ },
56
+ {
57
+ "step": 400,
58
+ "epoch": 0,
59
+ "loss": 3.0421357417106627,
60
+ "perplexity": 20.949939147514087,
61
+ "grad_norm": 0.9999998708066421,
62
+ "lr": 7.552870090634441e-06,
63
+ "cross_attn_scale": 0.3021148036253776,
64
+ "tokens_per_sec": 30210.86789891559,
65
+ "step_time_ms": 68.70847702026367,
66
+ "elapsed_sec": 45.12389397621155,
67
+ "progress_pct": 3.0190957808136463
68
+ },
69
+ {
70
+ "step": 500,
71
+ "epoch": 0,
72
+ "loss": 3.0340122056007384,
73
+ "perplexity": 20.780440954816772,
74
+ "grad_norm": 0.9999998721696909,
75
+ "lr": 9.441087613293053e-06,
76
+ "cross_attn_scale": 0.3776435045317221,
77
+ "tokens_per_sec": 30287.550862030017,
78
+ "step_time_ms": 68.67650508880615,
79
+ "elapsed_sec": 56.31241059303284,
80
+ "progress_pct": 3.773869726017058
81
+ },
82
+ {
83
+ "step": 600,
84
+ "epoch": 0,
85
+ "loss": 3.0240328454971315,
86
+ "perplexity": 20.574096755393853,
87
+ "grad_norm": 0.9999998774025239,
88
+ "lr": 1.1329305135951662e-05,
89
+ "cross_attn_scale": 0.45317220543806647,
90
+ "tokens_per_sec": 30337.509813968918,
91
+ "step_time_ms": 68.6249566078186,
92
+ "elapsed_sec": 67.54812812805176,
93
+ "progress_pct": 4.528643671220469
94
+ },
95
+ {
96
+ "step": 700,
97
+ "epoch": 0,
98
+ "loss": 3.0103702998161315,
99
+ "perplexity": 20.294913736798694,
100
+ "grad_norm": 0.9999998854215264,
101
+ "lr": 1.3217522658610274e-05,
102
+ "cross_attn_scale": 0.5287009063444109,
103
+ "tokens_per_sec": 30390.876870190146,
104
+ "step_time_ms": 68.71308088302612,
105
+ "elapsed_sec": 78.77034974098206,
106
+ "progress_pct": 5.283417616423881
107
+ },
108
+ {
109
+ "step": 800,
110
+ "epoch": 0,
111
+ "loss": 3.005305087566376,
112
+ "perplexity": 20.192375599076605,
113
+ "grad_norm": 0.9999998881305956,
114
+ "lr": 1.5105740181268882e-05,
115
+ "cross_attn_scale": 0.6042296072507553,
116
+ "tokens_per_sec": 30399.323564352733,
117
+ "step_time_ms": 68.67244720458984,
118
+ "elapsed_sec": 89.90821766853333,
119
+ "progress_pct": 6.0381915616272925
120
+ },
121
+ {
122
+ "step": 900,
123
+ "epoch": 0,
124
+ "loss": 2.9808620524406435,
125
+ "perplexity": 19.704795892601492,
126
+ "grad_norm": 0.999999884566103,
127
+ "lr": 1.6993957703927492e-05,
128
+ "cross_attn_scale": 0.6797583081570997,
129
+ "tokens_per_sec": 30464.53898853465,
130
+ "step_time_ms": 68.634774684906,
131
+ "elapsed_sec": 101.1823616027832,
132
+ "progress_pct": 6.792965506830704
133
+ },
134
+ {
135
+ "step": 1000,
136
+ "epoch": 0,
137
+ "loss": 2.9657063841819764,
138
+ "perplexity": 19.408408192965855,
139
+ "grad_norm": 0.9999998912970435,
140
+ "lr": 1.8882175226586106e-05,
141
+ "cross_attn_scale": 0.7552870090634441,
142
+ "tokens_per_sec": 30482.017177706704,
143
+ "step_time_ms": 68.62752199172974,
144
+ "elapsed_sec": 112.40210843086243,
145
+ "progress_pct": 7.547739452034116
146
+ },
147
+ {
148
+ "step": 1100,
149
+ "epoch": 0,
150
+ "loss": 2.9404901576042177,
151
+ "perplexity": 18.925120330844894,
152
+ "grad_norm": 0.9999998957905454,
153
+ "lr": 2.0770392749244713e-05,
154
+ "cross_attn_scale": 0.8308157099697885,
155
+ "tokens_per_sec": 30520.18917730466,
156
+ "step_time_ms": 68.61571073532104,
157
+ "elapsed_sec": 123.61365056037903,
158
+ "progress_pct": 8.302513397237528
159
+ },
160
+ {
161
+ "step": 1200,
162
+ "epoch": 0,
163
+ "loss": 2.916158149242401,
164
+ "perplexity": 18.470191258417472,
165
+ "grad_norm": 0.9999999059134038,
166
+ "lr": 2.2658610271903323e-05,
167
+ "cross_attn_scale": 0.9063444108761329,
168
+ "tokens_per_sec": 30514.05504365179,
169
+ "step_time_ms": 68.67437601089478,
170
+ "elapsed_sec": 134.80070066452026,
171
+ "progress_pct": 9.057287342440938
172
+ },
173
+ {
174
+ "step": 1300,
175
+ "epoch": 0,
176
+ "loss": 2.9002408885955813,
177
+ "perplexity": 18.17852384113777,
178
+ "grad_norm": 0.9999999221452138,
179
+ "lr": 2.4546827794561937e-05,
180
+ "cross_attn_scale": 0.9818731117824774,
181
+ "tokens_per_sec": 30518.41110587493,
182
+ "step_time_ms": 68.52020263671875,
183
+ "elapsed_sec": 145.9552721977234,
184
+ "progress_pct": 9.812061287644351
185
+ },
186
+ {
187
+ "step": 1400,
188
+ "epoch": 0,
189
+ "loss": 2.872870879173279,
190
+ "perplexity": 17.687724699355996,
191
+ "grad_norm": 0.9999999227685785,
192
+ "lr": 2.6435045317220547e-05,
193
+ "cross_attn_scale": 1,
194
+ "tokens_per_sec": 30511.622849857926,
195
+ "step_time_ms": 68.70761156082153,
196
+ "elapsed_sec": 157.15584921836853,
197
+ "progress_pct": 10.566835232847762
198
+ },
199
+ {
200
+ "step": 1500,
201
+ "epoch": 0,
202
+ "loss": 2.8776234197616577,
203
+ "perplexity": 17.771986398880262,
204
+ "grad_norm": 0.9999999237417435,
205
+ "lr": 2.8323262839879154e-05,
206
+ "cross_attn_scale": 1,
207
+ "tokens_per_sec": 30493.903728302623,
208
+ "step_time_ms": 68.75503063201904,
209
+ "elapsed_sec": 168.2848823070526,
210
+ "progress_pct": 11.321609178051174
211
+ },
212
+ {
213
+ "step": 1600,
214
+ "epoch": 0,
215
+ "loss": 2.8588545417785642,
216
+ "perplexity": 17.44153693803145,
217
+ "grad_norm": 0.9999999207595216,
218
+ "lr": 3.0211480362537764e-05,
219
+ "cross_attn_scale": 1,
220
+ "tokens_per_sec": 30505.121130819396,
221
+ "step_time_ms": 68.64941120147705,
222
+ "elapsed_sec": 179.4573106765747,
223
+ "progress_pct": 12.076383123254585
224
+ },
225
+ {
226
+ "step": 1700,
227
+ "epoch": 0,
228
+ "loss": 2.8620681262016294,
229
+ "perplexity": 17.4976769464644,
230
+ "grad_norm": 0.9999999136486478,
231
+ "lr": 3.209969788519638e-05,
232
+ "cross_attn_scale": 1,
233
+ "tokens_per_sec": 30496.028601579812,
234
+ "step_time_ms": 68.41788530349731,
235
+ "elapsed_sec": 190.5410726070404,
236
+ "progress_pct": 12.831157068457998
237
+ },
238
+ {
239
+ "step": 1800,
240
+ "epoch": 0,
241
+ "loss": 2.851705029010773,
242
+ "perplexity": 17.317283153287743,
243
+ "grad_norm": 0.9999999101516841,
244
+ "lr": 3.3987915407854985e-05,
245
+ "cross_attn_scale": 1,
246
+ "tokens_per_sec": 30486.163746654358,
247
+ "step_time_ms": 68.65042924880981,
248
+ "elapsed_sec": 201.64810013771057,
249
+ "progress_pct": 13.585931013661408
250
+ },
251
+ {
252
+ "step": 1900,
253
+ "epoch": 0,
254
+ "loss": 2.839317078590393,
255
+ "perplexity": 17.104080804749838,
256
+ "grad_norm": 0.9999999094789626,
257
+ "lr": 3.5876132930513595e-05,
258
+ "cross_attn_scale": 1,
259
+ "tokens_per_sec": 30499.8027812949,
260
+ "step_time_ms": 68.49920272827148,
261
+ "elapsed_sec": 212.8020317554474,
262
+ "progress_pct": 14.34070495886482
263
+ },
264
+ {
265
+ "step": 2000,
266
+ "epoch": 0,
267
+ "loss": 2.833750886917114,
268
+ "perplexity": 17.009140685170195,
269
+ "grad_norm": 0.9999999050466546,
270
+ "lr": 3.776435045317221e-05,
271
+ "cross_attn_scale": 1,
272
+ "tokens_per_sec": 30498.478093732738,
273
+ "step_time_ms": 68.63363027572632,
274
+ "elapsed_sec": 223.98661923408508,
275
+ "progress_pct": 15.095478904068232
276
+ },
277
+ {
278
+ "step": 2100,
279
+ "epoch": 0,
280
+ "loss": 2.833835325241089,
281
+ "perplexity": 17.010576969139752,
282
+ "grad_norm": 0.9999999051555772,
283
+ "lr": 3.9652567975830816e-05,
284
+ "cross_attn_scale": 1,
285
+ "tokens_per_sec": 30496.763241222277,
286
+ "step_time_ms": 68.50829124450684,
287
+ "elapsed_sec": 235.1098358631134,
288
+ "progress_pct": 15.850252849271643
289
+ },
290
+ {
291
+ "step": 2200,
292
+ "epoch": 0,
293
+ "loss": 2.830011742115021,
294
+ "perplexity": 16.9456598012593,
295
+ "grad_norm": 0.9999999086205261,
296
+ "lr": 4.1540785498489426e-05,
297
+ "cross_attn_scale": 1,
298
+ "tokens_per_sec": 30504.708020780217,
299
+ "step_time_ms": 68.68033170700073,
300
+ "elapsed_sec": 246.3288288116455,
301
+ "progress_pct": 16.605026794475055
302
+ },
303
+ {
304
+ "step": 2300,
305
+ "epoch": 0,
306
+ "loss": 2.826364483833313,
307
+ "perplexity": 16.88396717606802,
308
+ "grad_norm": 0.9999999082911996,
309
+ "lr": 4.342900302114804e-05,
310
+ "cross_attn_scale": 1,
311
+ "tokens_per_sec": 30501.204524633395,
312
+ "step_time_ms": 68.73678207397461,
313
+ "elapsed_sec": 257.5635986328125,
314
+ "progress_pct": 17.359800739678466
315
+ },
316
+ {
317
+ "step": 2400,
318
+ "epoch": 0,
319
+ "loss": 2.8305963826179505,
320
+ "perplexity": 16.95556981694399,
321
+ "grad_norm": 0.9999999116385626,
322
+ "lr": 4.5317220543806646e-05,
323
+ "cross_attn_scale": 1,
324
+ "tokens_per_sec": 30500.794054619466,
325
+ "step_time_ms": 68.69770526885986,
326
+ "elapsed_sec": 268.69559478759766,
327
+ "progress_pct": 18.114574684881877
328
+ },
329
+ {
330
+ "step": 2500,
331
+ "epoch": 0,
332
+ "loss": 2.823909387588501,
333
+ "perplexity": 16.842566254075912,
334
+ "grad_norm": 0.9999999056484468,
335
+ "lr": 4.720543806646526e-05,
336
+ "cross_attn_scale": 1,
337
+ "tokens_per_sec": 30494.381494274585,
338
+ "step_time_ms": 68.81577491760254,
339
+ "elapsed_sec": 279.87601590156555,
340
+ "progress_pct": 18.86934863008529
341
+ },
342
+ {
343
+ "step": 2600,
344
+ "epoch": 0,
345
+ "loss": 2.8317417907714844,
346
+ "perplexity": 16.975001991621266,
347
+ "grad_norm": 0.9999999060478559,
348
+ "lr": 4.9093655589123874e-05,
349
+ "cross_attn_scale": 1,
350
+ "tokens_per_sec": 30493.046658925927,
351
+ "step_time_ms": 68.76836061477661,
352
+ "elapsed_sec": 291.05359983444214,
353
+ "progress_pct": 19.624122575288702
354
+ },
355
+ {
356
+ "step": 2700,
357
+ "epoch": 0,
358
+ "loss": 2.8178824305534365,
359
+ "perplexity": 16.741362113751652,
360
+ "grad_norm": 0.9999999106669464,
361
+ "lr": 5.098187311178248e-05,
362
+ "cross_attn_scale": 1,
363
+ "tokens_per_sec": 30497.10975803011,
364
+ "step_time_ms": 68.62552165985107,
365
+ "elapsed_sec": 302.2955641746521,
366
+ "progress_pct": 20.378896520492113
367
+ },
368
+ {
369
+ "step": 2800,
370
+ "epoch": 0,
371
+ "loss": 2.8133517289161682,
372
+ "perplexity": 16.665683564931342,
373
+ "grad_norm": 0.9999999143853615,
374
+ "lr": 5.2870090634441094e-05,
375
+ "cross_attn_scale": 1,
376
+ "tokens_per_sec": 30502.20993387066,
377
+ "step_time_ms": 68.7237548828125,
378
+ "elapsed_sec": 313.51184129714966,
379
+ "progress_pct": 21.133670465695523
380
+ },
381
+ {
382
+ "step": 2900,
383
+ "epoch": 0,
384
+ "loss": 2.8192299604415894,
385
+ "perplexity": 16.763936806188713,
386
+ "grad_norm": 0.9999999153165114,
387
+ "lr": 5.4758308157099705e-05,
388
+ "cross_attn_scale": 1,
389
+ "tokens_per_sec": 30495.965582028683,
390
+ "step_time_ms": 68.75383377075195,
391
+ "elapsed_sec": 324.68157052993774,
392
+ "progress_pct": 21.888444410898934
393
+ },
394
+ {
395
+ "step": 3000,
396
+ "epoch": 0,
397
+ "loss": 2.814334969520569,
398
+ "perplexity": 16.682078000228916,
399
+ "grad_norm": 0.9999999103789803,
400
+ "lr": 5.664652567975831e-05,
401
+ "cross_attn_scale": 1,
402
+ "tokens_per_sec": 30497.05613414167,
403
+ "step_time_ms": 68.89568090438843,
404
+ "elapsed_sec": 335.9107172489166,
405
+ "progress_pct": 22.64321835610235
406
+ },
407
+ {
408
+ "step": 3100,
409
+ "epoch": 0,
410
+ "loss": 2.812694685459137,
411
+ "perplexity": 16.654737083138976,
412
+ "grad_norm": 0.9999999103699377,
413
+ "lr": 5.853474320241692e-05,
414
+ "cross_attn_scale": 1,
415
+ "tokens_per_sec": 30503.323091845978,
416
+ "step_time_ms": 68.7230896949768,
417
+ "elapsed_sec": 347.12381887435913,
418
+ "progress_pct": 23.39799230130576
419
+ },
420
+ {
421
+ "step": 3200,
422
+ "epoch": 0,
423
+ "loss": 2.8209491848945616,
424
+ "perplexity": 16.79278256533621,
425
+ "grad_norm": 0.9999999116175871,
426
+ "lr": 6.042296072507553e-05,
427
+ "cross_attn_scale": 1,
428
+ "tokens_per_sec": 30471.149506940925,
429
+ "step_time_ms": 72.36854791641235,
430
+ "elapsed_sec": 358.6222107410431,
431
+ "progress_pct": 24.15276624650917
432
+ },
433
+ {
434
+ "step": 3300,
435
+ "epoch": 0,
436
+ "loss": 2.823811297416687,
437
+ "perplexity": 16.84091424488252,
438
+ "grad_norm": 0.9999999183970942,
439
+ "lr": 6.231117824773414e-05,
440
+ "cross_attn_scale": 1,
441
+ "tokens_per_sec": 30475.15581021754,
442
+ "step_time_ms": 68.7209701538086,
443
+ "elapsed_sec": 369.78560733795166,
444
+ "progress_pct": 24.90754019171258
445
+ },
446
+ {
447
+ "step": 3400,
448
+ "epoch": 0,
449
+ "loss": 2.8320892524719237,
450
+ "perplexity": 16.98090117948981,
451
+ "grad_norm": 0.9999999248059107,
452
+ "lr": 6.419939577039276e-05,
453
+ "cross_attn_scale": 1,
454
+ "tokens_per_sec": 29670.286366915152,
455
+ "step_time_ms": 68.97594928741455,
456
+ "elapsed_sec": 391.05166888237,
457
+ "progress_pct": 25.662314136915995
458
+ },
459
+ {
460
+ "step": 3500,
461
+ "epoch": 0,
462
+ "loss": 2.815865466594696,
463
+ "perplexity": 16.70762941999754,
464
+ "grad_norm": 0.9999999286289808,
465
+ "lr": 6.608761329305136e-05,
466
+ "cross_attn_scale": 1,
467
+ "tokens_per_sec": 29686.79626575453,
468
+ "step_time_ms": 69.19201135635376,
469
+ "elapsed_sec": 402.31939792633057,
470
+ "progress_pct": 26.417088082119406
471
+ },
472
+ {
473
+ "step": 3600,
474
+ "epoch": 0,
475
+ "loss": 2.846110601425171,
476
+ "perplexity": 17.220673356933453,
477
+ "grad_norm": 0.9999999358015611,
478
+ "lr": 6.797583081570997e-05,
479
+ "cross_attn_scale": 1,
480
+ "tokens_per_sec": 29707.146446527197,
481
+ "step_time_ms": 68.82575511932373,
482
+ "elapsed_sec": 413.6111161708832,
483
+ "progress_pct": 27.171862027322817
484
+ },
485
+ {
486
+ "step": 3700,
487
+ "epoch": 0,
488
+ "loss": 2.8651936769485475,
489
+ "perplexity": 17.55245238082157,
490
+ "grad_norm": 0.9999999406451835,
491
+ "lr": 6.986404833836858e-05,
492
+ "cross_attn_scale": 1,
493
+ "tokens_per_sec": 29731.035920613755,
494
+ "step_time_ms": 68.87631177902222,
495
+ "elapsed_sec": 424.8875832557678,
496
+ "progress_pct": 27.926635972526228
497
+ },
498
+ {
499
+ "step": 3800,
500
+ "epoch": 0,
501
+ "loss": 2.8802473044395445,
502
+ "perplexity": 17.81867927325758,
503
+ "grad_norm": 0.999999938128731,
504
+ "lr": 7.175226586102719e-05,
505
+ "cross_attn_scale": 1,
506
+ "tokens_per_sec": 29740.50111024254,
507
+ "step_time_ms": 68.9161491394043,
508
+ "elapsed_sec": 436.10438680648804,
509
+ "progress_pct": 28.68140991772964
510
+ },
511
+ {
512
+ "step": 3900,
513
+ "epoch": 0,
514
+ "loss": 2.8760248684883116,
515
+ "perplexity": 17.7435996622643,
516
+ "grad_norm": 0.9999999370933762,
517
+ "lr": 7.36404833836858e-05,
518
+ "cross_attn_scale": 1,
519
+ "tokens_per_sec": 29756.389974960584,
520
+ "step_time_ms": 68.69559288024902,
521
+ "elapsed_sec": 447.3066461086273,
522
+ "progress_pct": 29.436183862933053
523
+ },
524
+ {
525
+ "step": 4000,
526
+ "epoch": 0,
527
+ "loss": 2.833030352592468,
528
+ "perplexity": 16.996889429728867,
529
+ "grad_norm": 0.9999999344961908,
530
+ "lr": 7.552870090634442e-05,
531
+ "cross_attn_scale": 1,
532
+ "tokens_per_sec": 29775.895470741994,
533
+ "step_time_ms": 68.65659475326538,
534
+ "elapsed_sec": 458.5267641544342,
535
+ "progress_pct": 30.190957808136464
536
+ },
537
+ {
538
+ "step": 4100,
539
+ "epoch": 0,
540
+ "loss": 2.8502792358398437,
541
+ "perplexity": 17.29260988289878,
542
+ "grad_norm": 0.9999999411152237,
543
+ "lr": 7.741691842900302e-05,
544
+ "cross_attn_scale": 1,
545
+ "tokens_per_sec": 29789.044694580516,
546
+ "step_time_ms": 68.78466129302979,
547
+ "elapsed_sec": 469.70932245254517,
548
+ "progress_pct": 30.945731753339874
549
+ },
550
+ {
551
+ "step": 4200,
552
+ "epoch": 0,
553
+ "loss": 2.845259420871735,
554
+ "perplexity": 17.206021691146056,
555
+ "grad_norm": 0.9999999597855691,
556
+ "lr": 7.930513595166163e-05,
557
+ "cross_attn_scale": 1,
558
+ "tokens_per_sec": 29810.506707537246,
559
+ "step_time_ms": 68.6014461517334,
560
+ "elapsed_sec": 480.918527841568,
561
+ "progress_pct": 31.700505698543285
562
+ },
563
+ {
564
+ "step": 4300,
565
+ "epoch": 0,
566
+ "loss": 2.851608917713165,
567
+ "perplexity": 17.315618846713456,
568
+ "grad_norm": 0.9999999759995412,
569
+ "lr": 8.119335347432024e-05,
570
+ "cross_attn_scale": 1,
571
+ "tokens_per_sec": 29823.229438646857,
572
+ "step_time_ms": 68.48395347595215,
573
+ "elapsed_sec": 492.0337359905243,
574
+ "progress_pct": 32.4552796437467
575
+ },
576
+ {
577
+ "step": 4400,
578
+ "epoch": 0,
579
+ "loss": 2.8928983926773073,
580
+ "perplexity": 18.045536929430227,
581
+ "grad_norm": 0.9999999802354099,
582
+ "lr": 8.308157099697885e-05,
583
+ "cross_attn_scale": 1,
584
+ "tokens_per_sec": 29839.21784462899,
585
+ "step_time_ms": 68.63518953323364,
586
+ "elapsed_sec": 503.2420446872711,
587
+ "progress_pct": 33.21005358895011
588
+ },
589
+ {
590
+ "step": 4500,
591
+ "epoch": 0,
592
+ "loss": 2.8919497418403624,
593
+ "perplexity": 18.028426133087862,
594
+ "grad_norm": 0.9999999832088211,
595
+ "lr": 8.496978851963746e-05,
596
+ "cross_attn_scale": 1,
597
+ "tokens_per_sec": 29851.35171674971,
598
+ "step_time_ms": 68.82947206497192,
599
+ "elapsed_sec": 514.4808900356293,
600
+ "progress_pct": 33.96482753415352
601
+ },
602
+ {
603
+ "step": 4600,
604
+ "epoch": 0,
605
+ "loss": 2.912613160610199,
606
+ "perplexity": 18.404830560286392,
607
+ "grad_norm": 0.9999999852952282,
608
+ "lr": 8.685800604229609e-05,
609
+ "cross_attn_scale": 1,
610
+ "tokens_per_sec": 29868.9929982354,
611
+ "step_time_ms": 68.60780239105225,
612
+ "elapsed_sec": 525.6990418434143,
613
+ "progress_pct": 34.71960147935693
614
+ },
615
+ {
616
+ "step": 4700,
617
+ "epoch": 0,
618
+ "loss": 2.9198369789123535,
619
+ "perplexity": 18.538265085260896,
620
+ "grad_norm": 0.9999999861072904,
621
+ "lr": 8.874622356495468e-05,
622
+ "cross_attn_scale": 1,
623
+ "tokens_per_sec": 29881.651406257948,
624
+ "step_time_ms": 68.78975629806519,
625
+ "elapsed_sec": 536.9214298725128,
626
+ "progress_pct": 35.47437542456034
627
+ },
628
+ {
629
+ "step": 4800,
630
+ "epoch": 0,
631
+ "loss": 2.9220591092109682,
632
+ "perplexity": 18.57950532941875,
633
+ "grad_norm": 0.9999999939528834,
634
+ "lr": 9.063444108761329e-05,
635
+ "cross_attn_scale": 1,
636
+ "tokens_per_sec": 29894.195528337976,
637
+ "step_time_ms": 68.80178928375244,
638
+ "elapsed_sec": 548.1407246589661,
639
+ "progress_pct": 36.22914936976375
640
+ },
641
+ {
642
+ "step": 4900,
643
+ "epoch": 0,
644
+ "loss": 2.9370435547828673,
645
+ "perplexity": 18.86000523511343,
646
+ "grad_norm": 1.0000000011274732,
647
+ "lr": 9.25226586102719e-05,
648
+ "cross_attn_scale": 1,
649
+ "tokens_per_sec": 29899.872434533,
650
+ "step_time_ms": 68.81359100341797,
651
+ "elapsed_sec": 559.2738242149353,
652
+ "progress_pct": 36.983923314967164
653
+ },
654
+ {
655
+ "step": 5000,
656
+ "epoch": 0,
657
+ "loss": 2.9583297395706176,
658
+ "perplexity": 19.265766020381903,
659
+ "grad_norm": 0.9999999952426983,
660
+ "lr": 9.441087613293051e-05,
661
+ "cross_attn_scale": 1,
662
+ "tokens_per_sec": 29914.890276799215,
663
+ "step_time_ms": 68.60477924346924,
664
+ "elapsed_sec": 570.4796454906464,
665
+ "progress_pct": 37.73869726017058
666
+ },
667
+ {
668
+ "step": 5100,
669
+ "epoch": 0,
670
+ "loss": 2.9550197100639344,
671
+ "perplexity": 19.20210119074545,
672
+ "grad_norm": 0.9999999955847756,
673
+ "lr": 9.629909365558912e-05,
674
+ "cross_attn_scale": 1,
675
+ "tokens_per_sec": 29927.735715089886,
676
+ "step_time_ms": 68.55671644210815,
677
+ "elapsed_sec": 581.667058467865,
678
+ "progress_pct": 38.49347120537399
679
+ },
680
+ {
681
+ "step": 5200,
682
+ "epoch": 0,
683
+ "loss": 2.9828960919380187,
684
+ "perplexity": 19.744917015876656,
685
+ "grad_norm": 0.9999999973577894,
686
+ "lr": 9.818731117824775e-05,
687
+ "cross_attn_scale": 1,
688
+ "tokens_per_sec": 29941.160552589776,
689
+ "step_time_ms": 68.76250982284546,
690
+ "elapsed_sec": 592.8996963500977,
691
+ "progress_pct": 39.248245150577404
692
+ },
693
+ {
694
+ "step": 5300,
695
+ "epoch": 0,
696
+ "loss": 2.9812774753570555,
697
+ "perplexity": 19.712983416903363,
698
+ "grad_norm": 0.9999999977527527,
699
+ "lr": 9.999999828225707e-05,
700
+ "cross_attn_scale": 1,
701
+ "tokens_per_sec": 29953.48526766276,
702
+ "step_time_ms": 68.57383966445923,
703
+ "elapsed_sec": 604.1068956851959,
704
+ "progress_pct": 40.003019095780814
705
+ },
706
+ {
707
+ "step": 5400,
708
+ "epoch": 0,
709
+ "loss": 2.9874549293518067,
710
+ "perplexity": 19.835136373740465,
711
+ "grad_norm": 1.000000005050804,
712
+ "lr": 9.99988388103099e-05,
713
+ "cross_attn_scale": 1,
714
+ "tokens_per_sec": 29965.8283068889,
715
+ "step_time_ms": 68.63211393356323,
716
+ "elapsed_sec": 615.2640204429626,
717
+ "progress_pct": 40.757793040984225
718
+ },
719
+ {
720
+ "step": 5500,
721
+ "epoch": 0,
722
+ "loss": 2.9855289316177367,
723
+ "perplexity": 19.7969707113172,
724
+ "grad_norm": 1.0000000047012423,
725
+ "lr": 9.999553221781354e-05,
726
+ "cross_attn_scale": 1,
727
+ "tokens_per_sec": 29979.453810055362,
728
+ "step_time_ms": 68.55385541915894,
729
+ "elapsed_sec": 626.467483997345,
730
+ "progress_pct": 41.512566986187636
731
+ },
732
+ {
733
+ "step": 5600,
734
+ "epoch": 0,
735
+ "loss": 3.001195156574249,
736
+ "perplexity": 20.10955663548813,
737
+ "grad_norm": 1.0000000009146937,
738
+ "lr": 9.999007864819872e-05,
739
+ "cross_attn_scale": 1,
740
+ "tokens_per_sec": 29985.720598712633,
741
+ "step_time_ms": 68.79624128341675,
742
+ "elapsed_sec": 637.6670501232147,
743
+ "progress_pct": 42.26734093139105
744
+ },
745
+ {
746
+ "step": 5700,
747
+ "epoch": 0,
748
+ "loss": 2.9869230651855467,
749
+ "perplexity": 19.824589580449725,
750
+ "grad_norm": 0.9999999955026968,
751
+ "lr": 9.998247833802596e-05,
752
+ "cross_attn_scale": 1,
753
+ "tokens_per_sec": 30003.619229602187,
754
+ "step_time_ms": 68.55745553970337,
755
+ "elapsed_sec": 648.9761068820953,
756
+ "progress_pct": 43.02211487659446
757
+ },
758
+ {
759
+ "step": 5800,
760
+ "epoch": 0,
761
+ "loss": 2.992214176654816,
762
+ "perplexity": 19.929761687056967,
763
+ "grad_norm": 0.9999999920373142,
764
+ "lr": 9.997273161697535e-05,
765
+ "cross_attn_scale": 1,
766
+ "tokens_per_sec": 30007.238330623877,
767
+ "step_time_ms": 68.54370355606079,
768
+ "elapsed_sec": 660.0942339897156,
769
+ "progress_pct": 43.77688882179787
770
+ },
771
+ {
772
+ "step": 5900,
773
+ "epoch": 0,
774
+ "loss": 2.9899471187591553,
775
+ "perplexity": 19.884630939803454,
776
+ "grad_norm": 0.9999999905480129,
777
+ "lr": 9.996083890783225e-05,
778
+ "cross_attn_scale": 1,
779
+ "tokens_per_sec": 30016.763888893925,
780
+ "step_time_ms": 68.59871864318848,
781
+ "elapsed_sec": 671.2416126728058,
782
+ "progress_pct": 44.531662767001286
783
+ },
784
+ {
785
+ "step": 6000,
786
+ "epoch": 0,
787
+ "loss": 2.973553490638733,
788
+ "perplexity": 19.561307160862828,
789
+ "grad_norm": 0.9999999961927993,
790
+ "lr": 9.9946800726469e-05,
791
+ "cross_attn_scale": 1,
792
+ "tokens_per_sec": 30019.099421691335,
793
+ "step_time_ms": 68.57965469360352,
794
+ "elapsed_sec": 682.3533148765564,
795
+ "progress_pct": 45.2864367122047
796
+ },
797
+ {
798
+ "step": 6100,
799
+ "epoch": 0,
800
+ "loss": 2.9827182602882387,
801
+ "perplexity": 19.74140605689802,
802
+ "grad_norm": 0.9999999982933864,
803
+ "lr": 9.993061768182244e-05,
804
+ "cross_attn_scale": 1,
805
+ "tokens_per_sec": 30026.30841714172,
806
+ "step_time_ms": 68.58452320098877,
807
+ "elapsed_sec": 693.5168223381042,
808
+ "progress_pct": 46.04121065740811
809
+ },
810
+ {
811
+ "step": 6200,
812
+ "epoch": 0,
813
+ "loss": 2.988657066822052,
814
+ "perplexity": 19.858995272367505,
815
+ "grad_norm": 0.9999999972547158,
816
+ "lr": 9.991229047586758e-05,
817
+ "cross_attn_scale": 1,
818
+ "tokens_per_sec": 30032.30921841359,
819
+ "step_time_ms": 68.62870931625366,
820
+ "elapsed_sec": 704.7209339141846,
821
+ "progress_pct": 46.79598460261152
822
+ },
823
+ {
824
+ "step": 6300,
825
+ "epoch": 0,
826
+ "loss": 3.0013710594177248,
827
+ "perplexity": 20.11309427481263,
828
+ "grad_norm": 1.0000000035538437,
829
+ "lr": 9.989181990358713e-05,
830
+ "cross_attn_scale": 1,
831
+ "tokens_per_sec": 30027.30892734739,
832
+ "step_time_ms": 72.44897842407227,
833
+ "elapsed_sec": 716.3112769126892,
834
+ "progress_pct": 47.55075854781493
835
+ },
836
+ {
837
+ "step": 6400,
838
+ "epoch": 0,
839
+ "loss": 3.0535635590553283,
840
+ "perplexity": 21.1907244301477,
841
+ "grad_norm": 1.0000000054736338,
842
+ "lr": 9.9869206852937e-05,
843
+ "cross_attn_scale": 1,
844
+ "tokens_per_sec": 30031.377185455563,
845
+ "step_time_ms": 68.55860948562622,
846
+ "elapsed_sec": 727.4697349071503,
847
+ "progress_pct": 48.30553249301834
848
+ },
849
+ {
850
+ "step": 6500,
851
+ "epoch": 0,
852
+ "loss": 3.0514983248710634,
853
+ "perplexity": 21.147005781827758,
854
+ "grad_norm": 1.0000000088419636,
855
+ "lr": 9.984445230480777e-05,
856
+ "cross_attn_scale": 1,
857
+ "tokens_per_sec": 30037.350771694248,
858
+ "step_time_ms": 68.51820468902588,
859
+ "elapsed_sec": 738.6134738922119,
860
+ "progress_pct": 49.06030643822175
861
+ },
862
+ {
863
+ "step": 6600,
864
+ "epoch": 0,
865
+ "loss": 3.073709189891815,
866
+ "perplexity": 21.62195405375797,
867
+ "grad_norm": 1.0000000080646183,
868
+ "lr": 9.981755733298221e-05,
869
+ "cross_attn_scale": 1,
870
+ "tokens_per_sec": 30044.33637658769,
871
+ "step_time_ms": 68.57419729232788,
872
+ "elapsed_sec": 749.8031148910522,
873
+ "progress_pct": 49.81508038342516
874
+ }
875
+ ],
876
+ "checkpoint": {
877
+ "pct": 50,
878
+ "step": 6624,
879
+ "final_loss": 3.068923241851031,
880
+ "total_tokens": 22608466,
881
+ "total_time_sec": 752.628674030304
882
+ }
883
+ }