| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.96, | |
| "eval_steps": 500, | |
| "global_step": 2400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 14.472631454467773, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 5.6365, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": null, | |
| "learning_rate": 6e-06, | |
| "loss": 7.9556, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 9.6599702835083, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 6.5246, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 40.07120895385742, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 7.7818, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 5.662575721740723, | |
| "learning_rate": 2.1e-05, | |
| "loss": 5.5899, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": null, | |
| "learning_rate": 2.5e-05, | |
| "loss": 7.1589, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 7.6021246910095215, | |
| "learning_rate": 3e-05, | |
| "loss": 4.9643, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 7.397390365600586, | |
| "learning_rate": 3.5e-05, | |
| "loss": 5.944, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 10.122846603393555, | |
| "learning_rate": 4e-05, | |
| "loss": 4.2179, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 7.520490646362305, | |
| "learning_rate": 4.5e-05, | |
| "loss": 3.9576, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 12.957700729370117, | |
| "learning_rate": 5e-05, | |
| "loss": 4.6752, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 11.252644538879395, | |
| "learning_rate": 4.999948617395915e-05, | |
| "loss": 4.586, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 14.29404354095459, | |
| "learning_rate": 4.9997944716957985e-05, | |
| "loss": 4.2009, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 12.03260612487793, | |
| "learning_rate": 4.9995375692359755e-05, | |
| "loss": 4.7383, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 7.339493751525879, | |
| "learning_rate": 4.9991779205767e-05, | |
| "loss": 4.0294, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 5.51794958114624, | |
| "learning_rate": 4.99871554050172e-05, | |
| "loss": 3.8225, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 10.195987701416016, | |
| "learning_rate": 4.9981504480176696e-05, | |
| "loss": 4.1299, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 14.785408973693848, | |
| "learning_rate": 4.997482666353287e-05, | |
| "loss": 4.1754, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 9.586732864379883, | |
| "learning_rate": 4.996712222958461e-05, | |
| "loss": 3.2669, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 25.427532196044922, | |
| "learning_rate": 4.9958391495031026e-05, | |
| "loss": 4.1074, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 7.9541730880737305, | |
| "learning_rate": 4.994863481875841e-05, | |
| "loss": 4.2565, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 8.371864318847656, | |
| "learning_rate": 4.993785260182552e-05, | |
| "loss": 3.3137, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 9.015960693359375, | |
| "learning_rate": 4.992604528744705e-05, | |
| "loss": 3.4324, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 6.063784599304199, | |
| "learning_rate": 4.991321336097546e-05, | |
| "loss": 3.3621, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 7.260740280151367, | |
| "learning_rate": 4.989935734988098e-05, | |
| "loss": 4.7443, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 9.744063377380371, | |
| "learning_rate": 4.9884477823729956e-05, | |
| "loss": 4.085, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 9.156418800354004, | |
| "learning_rate": 4.986857539416144e-05, | |
| "loss": 3.5387, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 7.296212673187256, | |
| "learning_rate": 4.9851650714862006e-05, | |
| "loss": 3.348, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 8.042045593261719, | |
| "learning_rate": 4.983370448153896e-05, | |
| "loss": 3.5621, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 12.138426780700684, | |
| "learning_rate": 4.981473743189163e-05, | |
| "loss": 3.4145, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 12.3366060256958, | |
| "learning_rate": 4.979475034558115e-05, | |
| "loss": 3.9396, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 6.696476936340332, | |
| "learning_rate": 4.977374404419837e-05, | |
| "loss": 3.5372, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.023372173309326, | |
| "learning_rate": 4.975171939123005e-05, | |
| "loss": 3.2419, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 7.861164093017578, | |
| "learning_rate": 4.9728677292023405e-05, | |
| "loss": 3.7565, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.757652282714844, | |
| "learning_rate": 4.970461869374889e-05, | |
| "loss": 3.2182, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 9.434078216552734, | |
| "learning_rate": 4.967954458536126e-05, | |
| "loss": 3.229, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 27.395694732666016, | |
| "learning_rate": 4.965345599755887e-05, | |
| "loss": 3.903, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6.09558629989624, | |
| "learning_rate": 4.962635400274142e-05, | |
| "loss": 3.1898, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.992712497711182, | |
| "learning_rate": 4.959823971496574e-05, | |
| "loss": 3.2185, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 7.1479411125183105, | |
| "learning_rate": 4.95691142899001e-05, | |
| "loss": 3.2792, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 8.431621551513672, | |
| "learning_rate": 4.9538978924776634e-05, | |
| "loss": 3.1418, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 23.00255584716797, | |
| "learning_rate": 4.9507834858342186e-05, | |
| "loss": 3.9976, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 10.47463321685791, | |
| "learning_rate": 4.9475683370807326e-05, | |
| "loss": 3.4157, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 18.375465393066406, | |
| "learning_rate": 4.9442525783793794e-05, | |
| "loss": 3.5929, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 8.78403091430664, | |
| "learning_rate": 4.940836346028011e-05, | |
| "loss": 3.1698, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 7.213769912719727, | |
| "learning_rate": 4.937319780454559e-05, | |
| "loss": 3.9643, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 7.919612884521484, | |
| "learning_rate": 4.933703026211262e-05, | |
| "loss": 3.6494, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.598107814788818, | |
| "learning_rate": 4.9299862319687204e-05, | |
| "loss": 3.5917, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 37.90977096557617, | |
| "learning_rate": 4.926169550509787e-05, | |
| "loss": 3.7991, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 12.698101997375488, | |
| "learning_rate": 4.9222531387232885e-05, | |
| "loss": 3.5116, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 9.609795570373535, | |
| "learning_rate": 4.9182371575975736e-05, | |
| "loss": 3.4126, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 8.108003616333008, | |
| "learning_rate": 4.914121772213898e-05, | |
| "loss": 3.2962, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 11.94726848602295, | |
| "learning_rate": 4.909907151739633e-05, | |
| "loss": 3.6953, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 8.730710983276367, | |
| "learning_rate": 4.905593469421323e-05, | |
| "loss": 3.4841, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.423217296600342, | |
| "learning_rate": 4.9011809025775486e-05, | |
| "loss": 3.1157, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 10.736145973205566, | |
| "learning_rate": 4.8966696325916515e-05, | |
| "loss": 3.244, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 21.597434997558594, | |
| "learning_rate": 4.892059844904272e-05, | |
| "loss": 3.172, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 23.540430068969727, | |
| "learning_rate": 4.887351729005726e-05, | |
| "loss": 3.1889, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 11.286609649658203, | |
| "learning_rate": 4.882545478428218e-05, | |
| "loss": 3.5211, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 7.204074382781982, | |
| "learning_rate": 4.877641290737884e-05, | |
| "loss": 3.255, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 25.35500717163086, | |
| "learning_rate": 4.8726393675266716e-05, | |
| "loss": 3.4832, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 6.76464319229126, | |
| "learning_rate": 4.8675399144040537e-05, | |
| "loss": 2.9075, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 10.19654655456543, | |
| "learning_rate": 4.862343140988573e-05, | |
| "loss": 3.0107, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 7.545323848724365, | |
| "learning_rate": 4.8570492608992325e-05, | |
| "loss": 2.7479, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 15.334161758422852, | |
| "learning_rate": 4.851658491746707e-05, | |
| "loss": 2.8654, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 14.28708553314209, | |
| "learning_rate": 4.846171055124401e-05, | |
| "loss": 3.1851, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 22.403162002563477, | |
| "learning_rate": 4.8405871765993433e-05, | |
| "loss": 2.8862, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.840334415435791, | |
| "learning_rate": 4.834907085702908e-05, | |
| "loss": 2.968, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 12.307807922363281, | |
| "learning_rate": 4.829131015921385e-05, | |
| "loss": 2.9458, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 12.92584228515625, | |
| "learning_rate": 4.82325920468638e-05, | |
| "loss": 3.4169, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 13.389299392700195, | |
| "learning_rate": 4.817291893365055e-05, | |
| "loss": 3.1601, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 16.73059844970703, | |
| "learning_rate": 4.8112293272502043e-05, | |
| "loss": 3.2846, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 8.608989715576172, | |
| "learning_rate": 4.805071755550177e-05, | |
| "loss": 3.0091, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.731265544891357, | |
| "learning_rate": 4.7988194313786275e-05, | |
| "loss": 3.3688, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 11.152880668640137, | |
| "learning_rate": 4.7924726117441135e-05, | |
| "loss": 3.3177, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 8.759363174438477, | |
| "learning_rate": 4.7860315575395316e-05, | |
| "loss": 2.9184, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 37.01634216308594, | |
| "learning_rate": 4.7794965335313926e-05, | |
| "loss": 3.1494, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.318065643310547, | |
| "learning_rate": 4.772867808348938e-05, | |
| "loss": 3.7067, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 14.883944511413574, | |
| "learning_rate": 4.766145654473095e-05, | |
| "loss": 2.9832, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 35.67652130126953, | |
| "learning_rate": 4.759330348225284e-05, | |
| "loss": 3.6382, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.075866222381592, | |
| "learning_rate": 4.752422169756048e-05, | |
| "loss": 3.6166, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 10.807673454284668, | |
| "learning_rate": 4.745421403033548e-05, | |
| "loss": 2.896, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 17.647350311279297, | |
| "learning_rate": 4.738328335831883e-05, | |
| "loss": 2.9218, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 35.758544921875, | |
| "learning_rate": 4.731143259719265e-05, | |
| "loss": 3.0896, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 19.15152931213379, | |
| "learning_rate": 4.72386647004603e-05, | |
| "loss": 3.0875, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 10.721521377563477, | |
| "learning_rate": 4.716498265932501e-05, | |
| "loss": 2.833, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 36.97370147705078, | |
| "learning_rate": 4.709038950256688e-05, | |
| "loss": 3.3553, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 18.96316146850586, | |
| "learning_rate": 4.701488829641845e-05, | |
| "loss": 2.8189, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 8.662888526916504, | |
| "learning_rate": 4.693848214443858e-05, | |
| "loss": 3.4443, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 18.23220443725586, | |
| "learning_rate": 4.686117418738489e-05, | |
| "loss": 3.5717, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 39.92061996459961, | |
| "learning_rate": 4.678296760308474e-05, | |
| "loss": 3.2364, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 9.42786693572998, | |
| "learning_rate": 4.6703865606304465e-05, | |
| "loss": 2.9595, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 7.904202938079834, | |
| "learning_rate": 4.662387144861734e-05, | |
| "loss": 3.1582, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 26.168556213378906, | |
| "learning_rate": 4.6542988418269876e-05, | |
| "loss": 3.3465, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 10.387030601501465, | |
| "learning_rate": 4.6461219840046654e-05, | |
| "loss": 3.0016, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 9.973118782043457, | |
| "learning_rate": 4.637856907513366e-05, | |
| "loss": 3.2299, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 31.876556396484375, | |
| "learning_rate": 4.629503952098011e-05, | |
| "loss": 3.3682, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 23.903682708740234, | |
| "learning_rate": 4.6210634611158816e-05, | |
| "loss": 2.7361, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.048882007598877, | |
| "learning_rate": 4.612535781522504e-05, | |
| "loss": 2.8006, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 16.777807235717773, | |
| "learning_rate": 4.6039212638573833e-05, | |
| "loss": 2.7768, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 35.86275100708008, | |
| "learning_rate": 4.595220262229601e-05, | |
| "loss": 3.2371, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 18.5112361907959, | |
| "learning_rate": 4.586433134303257e-05, | |
| "loss": 3.9903, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 15.84546947479248, | |
| "learning_rate": 4.5775602412827604e-05, | |
| "loss": 3.2624, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 9.097062110900879, | |
| "learning_rate": 4.5686019478979915e-05, | |
| "loss": 3.036, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 12.135428428649902, | |
| "learning_rate": 4.559558622389304e-05, | |
| "loss": 3.0802, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 7.016368865966797, | |
| "learning_rate": 4.55043063649239e-05, | |
| "loss": 2.718, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 7.902190685272217, | |
| "learning_rate": 4.5412183654229965e-05, | |
| "loss": 2.9923, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 9.023144721984863, | |
| "learning_rate": 4.531922187861507e-05, | |
| "loss": 3.2273, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 12.821276664733887, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 3.0269, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 11.519277572631836, | |
| "learning_rate": 4.51307964521339e-05, | |
| "loss": 2.9414, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.272705554962158, | |
| "learning_rate": 4.503534054669892e-05, | |
| "loss": 3.1369, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 15.541712760925293, | |
| "learning_rate": 4.493906106688712e-05, | |
| "loss": 3.3594, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 29.176177978515625, | |
| "learning_rate": 4.484196197037082e-05, | |
| "loss": 3.7387, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.822291374206543, | |
| "learning_rate": 4.474404724851356e-05, | |
| "loss": 3.0015, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 13.846043586730957, | |
| "learning_rate": 4.4645320926206064e-05, | |
| "loss": 2.8736, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.161740303039551, | |
| "learning_rate": 4.454578706170075e-05, | |
| "loss": 3.1031, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 12.11698055267334, | |
| "learning_rate": 4.444544974644493e-05, | |
| "loss": 3.1832, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.144786834716797, | |
| "learning_rate": 4.434431310491267e-05, | |
| "loss": 2.7414, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 14.706486701965332, | |
| "learning_rate": 4.4242381294435154e-05, | |
| "loss": 2.7289, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 11.251163482666016, | |
| "learning_rate": 4.413965850502987e-05, | |
| "loss": 3.4787, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 11.005367279052734, | |
| "learning_rate": 4.4036148959228365e-05, | |
| "loss": 3.1987, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 13.336194038391113, | |
| "learning_rate": 4.393185691190264e-05, | |
| "loss": 2.9382, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 9.259475708007812, | |
| "learning_rate": 4.382678665009028e-05, | |
| "loss": 3.1349, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 20.902462005615234, | |
| "learning_rate": 4.372094249281821e-05, | |
| "loss": 3.1365, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 13.907161712646484, | |
| "learning_rate": 4.3614328790925177e-05, | |
| "loss": 3.3361, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.845438480377197, | |
| "learning_rate": 4.350694992688289e-05, | |
| "loss": 3.1903, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 14.879186630249023, | |
| "learning_rate": 4.3398810314615876e-05, | |
| "loss": 3.5133, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 10.454853057861328, | |
| "learning_rate": 4.3289914399320034e-05, | |
| "loss": 3.0928, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 10.692648887634277, | |
| "learning_rate": 4.318026665727993e-05, | |
| "loss": 3.1065, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.677508354187012, | |
| "learning_rate": 4.306987159568479e-05, | |
| "loss": 2.6484, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.058112621307373, | |
| "learning_rate": 4.2958733752443195e-05, | |
| "loss": 2.6531, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.029059886932373, | |
| "learning_rate": 4.284685769599658e-05, | |
| "loss": 3.3165, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 9.306894302368164, | |
| "learning_rate": 4.273424802513145e-05, | |
| "loss": 3.2707, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.9906392097473145, | |
| "learning_rate": 4.262090936879029e-05, | |
| "loss": 3.0822, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 10.101268768310547, | |
| "learning_rate": 4.250684638588138e-05, | |
| "loss": 3.3994, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 13.360941886901855, | |
| "learning_rate": 4.239206376508717e-05, | |
| "loss": 3.2358, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 25.73331642150879, | |
| "learning_rate": 4.227656622467162e-05, | |
| "loss": 3.1359, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.391608715057373, | |
| "learning_rate": 4.216035851228626e-05, | |
| "loss": 3.1343, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 21.201011657714844, | |
| "learning_rate": 4.204344540477499e-05, | |
| "loss": 3.0807, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 12.530668258666992, | |
| "learning_rate": 4.192583170797774e-05, | |
| "loss": 2.9729, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 17.209762573242188, | |
| "learning_rate": 4.180752225653292e-05, | |
| "loss": 2.9911, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 16.309785842895508, | |
| "learning_rate": 4.16885219136787e-05, | |
| "loss": 3.0701, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.244511127471924, | |
| "learning_rate": 4.1568835571053075e-05, | |
| "loss": 2.8169, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.627272605895996, | |
| "learning_rate": 4.144846814849282e-05, | |
| "loss": 3.0518, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 9.72758674621582, | |
| "learning_rate": 4.132742459383122e-05, | |
| "loss": 3.1135, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 14.121967315673828, | |
| "learning_rate": 4.120570988269472e-05, | |
| "loss": 2.9481, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 71.68965148925781, | |
| "learning_rate": 4.108332901829836e-05, | |
| "loss": 3.7509, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 22.303184509277344, | |
| "learning_rate": 4.096028703124014e-05, | |
| "loss": 2.9864, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 11.789464950561523, | |
| "learning_rate": 4.083658897929426e-05, | |
| "loss": 2.7573, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 21.576683044433594, | |
| "learning_rate": 4.071223994720309e-05, | |
| "loss": 3.4365, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 7.004244804382324, | |
| "learning_rate": 4.058724504646834e-05, | |
| "loss": 3.1543, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 8.743468284606934, | |
| "learning_rate": 4.046160941514079e-05, | |
| "loss": 2.638, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 9.981857299804688, | |
| "learning_rate": 4.033533821760917e-05, | |
| "loss": 2.4701, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 11.454618453979492, | |
| "learning_rate": 4.0208436644387834e-05, | |
| "loss": 3.4152, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 22.861881256103516, | |
| "learning_rate": 4.008090991190341e-05, | |
| "loss": 3.0526, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 34.58321762084961, | |
| "learning_rate": 3.9952763262280405e-05, | |
| "loss": 3.1283, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 12.163683891296387, | |
| "learning_rate": 3.982400196312564e-05, | |
| "loss": 2.8686, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 34.78618240356445, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 2.782, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 11.429190635681152, | |
| "learning_rate": 3.95646566127599e-05, | |
| "loss": 3.1518, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.942728519439697, | |
| "learning_rate": 3.943408322222049e-05, | |
| "loss": 3.1183, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.556036949157715, | |
| "learning_rate": 3.9302916503054246e-05, | |
| "loss": 2.9241, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.161313533782959, | |
| "learning_rate": 3.917116184701125e-05, | |
| "loss": 2.8394, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 17.556371688842773, | |
| "learning_rate": 3.903882467000937e-05, | |
| "loss": 2.667, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.0064167976379395, | |
| "learning_rate": 3.8905910411911625e-05, | |
| "loss": 2.8605, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 13.329277038574219, | |
| "learning_rate": 3.8772424536302564e-05, | |
| "loss": 2.8765, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 7.7756876945495605, | |
| "learning_rate": 3.8638372530263715e-05, | |
| "loss": 2.6558, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.505970478057861, | |
| "learning_rate": 3.850375990414801e-05, | |
| "loss": 2.6178, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 29.54509925842285, | |
| "learning_rate": 3.836859219135324e-05, | |
| "loss": 2.9655, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 12.92197322845459, | |
| "learning_rate": 3.823287494809469e-05, | |
| "loss": 2.7825, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 10.073705673217773, | |
| "learning_rate": 3.8096613753176634e-05, | |
| "loss": 3.3055, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 23.254878997802734, | |
| "learning_rate": 3.7959814207763135e-05, | |
| "loss": 2.906, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 16.555452346801758, | |
| "learning_rate": 3.782248193514766e-05, | |
| "loss": 3.2928, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 8.429189682006836, | |
| "learning_rate": 3.7684622580522055e-05, | |
| "loss": 2.6259, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 9.762368202209473, | |
| "learning_rate": 3.7546241810744445e-05, | |
| "loss": 2.7576, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 11.004511833190918, | |
| "learning_rate": 3.740734531410626e-05, | |
| "loss": 2.6714, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 56.73052978515625, | |
| "learning_rate": 3.726793880009845e-05, | |
| "loss": 3.542, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 10.330306053161621, | |
| "learning_rate": 3.7128027999176803e-05, | |
| "loss": 3.0011, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 12.130693435668945, | |
| "learning_rate": 3.698761866252635e-05, | |
| "loss": 3.0935, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 12.528786659240723, | |
| "learning_rate": 3.6846716561824965e-05, | |
| "loss": 3.3687, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 14.122909545898438, | |
| "learning_rate": 3.670532748900615e-05, | |
| "loss": 2.5591, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 8.046327590942383, | |
| "learning_rate": 3.656345725602089e-05, | |
| "loss": 2.3845, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 22.46470832824707, | |
| "learning_rate": 3.642111169459879e-05, | |
| "loss": 2.8321, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 10.554203987121582, | |
| "learning_rate": 3.6278296656008366e-05, | |
| "loss": 2.4067, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 8.757329940795898, | |
| "learning_rate": 3.6135018010816477e-05, | |
| "loss": 2.7003, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 8.956037521362305, | |
| "learning_rate": 3.599128164864706e-05, | |
| "loss": 2.7975, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 15.920394897460938, | |
| "learning_rate": 3.5847093477938956e-05, | |
| "loss": 2.7154, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 15.14090633392334, | |
| "learning_rate": 3.570245942570315e-05, | |
| "loss": 3.3547, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 17.72629165649414, | |
| "learning_rate": 3.5557385437279e-05, | |
| "loss": 2.7852, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.857475757598877, | |
| "learning_rate": 3.5411877476089975e-05, | |
| "loss": 2.73, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 22.337295532226562, | |
| "learning_rate": 3.526594152339845e-05, | |
| "loss": 3.1076, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 10.523070335388184, | |
| "learning_rate": 3.5119583578059846e-05, | |
| "loss": 3.1857, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 11.635588645935059, | |
| "learning_rate": 3.497280965627605e-05, | |
| "loss": 2.8584, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 27.29017448425293, | |
| "learning_rate": 3.4825625791348096e-05, | |
| "loss": 3.3567, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 25.24538230895996, | |
| "learning_rate": 3.467803803342821e-05, | |
| "loss": 3.1713, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 11.2062406539917, | |
| "learning_rate": 3.4530052449271044e-05, | |
| "loss": 2.8941, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 20.25462532043457, | |
| "learning_rate": 3.438167512198436e-05, | |
| "loss": 3.1291, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 13.0928373336792, | |
| "learning_rate": 3.4232912150778914e-05, | |
| "loss": 2.4672, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 21.326560974121094, | |
| "learning_rate": 3.408376965071779e-05, | |
| "loss": 3.0489, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 54.281578063964844, | |
| "learning_rate": 3.393425375246503e-05, | |
| "loss": 3.0251, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 18.1798038482666, | |
| "learning_rate": 3.378437060203357e-05, | |
| "loss": 3.0152, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 8.757742881774902, | |
| "learning_rate": 3.363412636053269e-05, | |
| "loss": 2.5223, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 7.601003170013428, | |
| "learning_rate": 3.348352720391469e-05, | |
| "loss": 2.8158, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 16.87445068359375, | |
| "learning_rate": 3.3332579322721046e-05, | |
| "loss": 3.3828, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 11.726461410522461, | |
| "learning_rate": 3.318128892182792e-05, | |
| "loss": 2.5411, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 11.669478416442871, | |
| "learning_rate": 3.3029662220191144e-05, | |
| "loss": 2.6686, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 58.972007751464844, | |
| "learning_rate": 3.2877705450590526e-05, | |
| "loss": 3.3026, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 7.434502124786377, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 2.6652, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 22.020599365234375, | |
| "learning_rate": 3.2572826706199305e-05, | |
| "loss": 3.1553, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 17.60317611694336, | |
| "learning_rate": 3.2419917263779766e-05, | |
| "loss": 3.2016, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 15.09745979309082, | |
| "learning_rate": 3.2266702817623346e-05, | |
| "loss": 2.6242, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 11.325228691101074, | |
| "learning_rate": 3.211318966577581e-05, | |
| "loss": 2.9649, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 16.898090362548828, | |
| "learning_rate": 3.195938411856159e-05, | |
| "loss": 2.4623, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 8.761795043945312, | |
| "learning_rate": 3.180529249832428e-05, | |
| "loss": 2.4152, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 11.047560691833496, | |
| "learning_rate": 3.165092113916688e-05, | |
| "loss": 3.0119, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 11.61601734161377, | |
| "learning_rate": 3.149627638669132e-05, | |
| "loss": 2.4781, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 21.56399154663086, | |
| "learning_rate": 3.1341364597737686e-05, | |
| "loss": 3.4005, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 15.666382789611816, | |
| "learning_rate": 3.118619214012286e-05, | |
| "loss": 2.9385, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 11.866064071655273, | |
| "learning_rate": 3.1030765392378816e-05, | |
| "loss": 3.0096, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 11.25349235534668, | |
| "learning_rate": 3.0875090743490384e-05, | |
| "loss": 2.0702, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 10.070486068725586, | |
| "learning_rate": 3.071917459263264e-05, | |
| "loss": 3.0652, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 7.047021389007568, | |
| "learning_rate": 3.056302334890786e-05, | |
| "loss": 2.8617, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 11.59208869934082, | |
| "learning_rate": 3.040664343108209e-05, | |
| "loss": 2.8697, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 9.361946105957031, | |
| "learning_rate": 3.0250041267321232e-05, | |
| "loss": 2.7646, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 11.60400390625, | |
| "learning_rate": 3.0093223294926892e-05, | |
| "loss": 2.8238, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 19.50318145751953, | |
| "learning_rate": 2.993619596007168e-05, | |
| "loss": 2.8857, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 24.245025634765625, | |
| "learning_rate": 2.9778965717534313e-05, | |
| "loss": 2.949, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 14.323025703430176, | |
| "learning_rate": 2.962153903043422e-05, | |
| "loss": 2.6237, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 13.869138717651367, | |
| "learning_rate": 2.9463922369965917e-05, | |
| "loss": 2.5021, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 9.667675018310547, | |
| "learning_rate": 2.9306122215132976e-05, | |
| "loss": 2.9346, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 22.312841415405273, | |
| "learning_rate": 2.91481450524817e-05, | |
| "loss": 2.7172, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 10.218385696411133, | |
| "learning_rate": 2.8989997375834482e-05, | |
| "loss": 2.4864, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 7.388011455535889, | |
| "learning_rate": 2.8831685686022897e-05, | |
| "loss": 2.7519, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 8.172524452209473, | |
| "learning_rate": 2.8673216490620452e-05, | |
| "loss": 2.701, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 11.60141372680664, | |
| "learning_rate": 2.8514596303675073e-05, | |
| "loss": 3.1608, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 21.537139892578125, | |
| "learning_rate": 2.8355831645441388e-05, | |
| "loss": 2.9307, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 12.06334400177002, | |
| "learning_rate": 2.8196929042112652e-05, | |
| "loss": 2.9414, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 8.39703369140625, | |
| "learning_rate": 2.8037895025552512e-05, | |
| "loss": 2.938, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 8.78774642944336, | |
| "learning_rate": 2.787873613302649e-05, | |
| "loss": 2.5751, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 24.190916061401367, | |
| "learning_rate": 2.7719458906933277e-05, | |
| "loss": 2.328, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 17.80084228515625, | |
| "learning_rate": 2.7560069894535784e-05, | |
| "loss": 2.7258, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 17.03345489501953, | |
| "learning_rate": 2.7400575647692046e-05, | |
| "loss": 2.8104, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 36.040897369384766, | |
| "learning_rate": 2.724098272258584e-05, | |
| "loss": 2.9138, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 9.431939125061035, | |
| "learning_rate": 2.7081297679457236e-05, | |
| "loss": 2.3052, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 10.720209121704102, | |
| "learning_rate": 2.692152708233292e-05, | |
| "loss": 2.3984, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 13.637066841125488, | |
| "learning_rate": 2.676167749875635e-05, | |
| "loss": 2.8592, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 8.968180656433105, | |
| "learning_rate": 2.6601755499517826e-05, | |
| "loss": 2.4064, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 9.793657302856445, | |
| "learning_rate": 2.6441767658384366e-05, | |
| "loss": 3.0291, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 8.77736759185791, | |
| "learning_rate": 2.628172055182948e-05, | |
| "loss": 2.3978, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 9.85341739654541, | |
| "learning_rate": 2.6121620758762877e-05, | |
| "loss": 2.5431, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 11.554092407226562, | |
| "learning_rate": 2.596147486025996e-05, | |
| "loss": 2.8736, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 8.345808029174805, | |
| "learning_rate": 2.5801289439291388e-05, | |
| "loss": 2.6837, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 9.449942588806152, | |
| "learning_rate": 2.564107108045239e-05, | |
| "loss": 3.1686, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 32.239585876464844, | |
| "learning_rate": 2.5480826369692178e-05, | |
| "loss": 3.0723, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 36.09556579589844, | |
| "learning_rate": 2.5320561894043184e-05, | |
| "loss": 2.6983, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 42.91587829589844, | |
| "learning_rate": 2.5160284241350278e-05, | |
| "loss": 2.6336, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 20.82798194885254, | |
| "learning_rate": 2.5e-05, | |
| "loss": 2.8635, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 7.868647575378418, | |
| "learning_rate": 2.4839715758649724e-05, | |
| "loss": 2.8232, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 12.662830352783203, | |
| "learning_rate": 2.467943810595682e-05, | |
| "loss": 2.8228, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 15.0082368850708, | |
| "learning_rate": 2.4519173630307825e-05, | |
| "loss": 2.6371, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 15.185354232788086, | |
| "learning_rate": 2.4358928919547616e-05, | |
| "loss": 2.6336, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 7.559114456176758, | |
| "learning_rate": 2.419871056070862e-05, | |
| "loss": 2.4209, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 13.428425788879395, | |
| "learning_rate": 2.403852513974004e-05, | |
| "loss": 2.6897, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 38.91737747192383, | |
| "learning_rate": 2.3878379241237136e-05, | |
| "loss": 2.569, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 15.211456298828125, | |
| "learning_rate": 2.3718279448170525e-05, | |
| "loss": 2.4856, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 11.391417503356934, | |
| "learning_rate": 2.3558232341615643e-05, | |
| "loss": 2.5527, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 21.21587371826172, | |
| "learning_rate": 2.339824450048218e-05, | |
| "loss": 3.5899, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 17.66887664794922, | |
| "learning_rate": 2.323832250124365e-05, | |
| "loss": 2.4669, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 10.46285629272461, | |
| "learning_rate": 2.3078472917667092e-05, | |
| "loss": 2.4984, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 14.3226318359375, | |
| "learning_rate": 2.291870232054277e-05, | |
| "loss": 2.8999, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 47.95698547363281, | |
| "learning_rate": 2.2759017277414166e-05, | |
| "loss": 2.7827, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 10.691919326782227, | |
| "learning_rate": 2.2599424352307957e-05, | |
| "loss": 2.9201, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 15.748466491699219, | |
| "learning_rate": 2.243993010546422e-05, | |
| "loss": 2.3433, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 18.585126876831055, | |
| "learning_rate": 2.2280541093066732e-05, | |
| "loss": 2.7896, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 9.304043769836426, | |
| "learning_rate": 2.212126386697352e-05, | |
| "loss": 2.781, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 14.388626098632812, | |
| "learning_rate": 2.196210497444749e-05, | |
| "loss": 2.9678, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 9.608166694641113, | |
| "learning_rate": 2.1803070957887347e-05, | |
| "loss": 3.0607, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 6.090882301330566, | |
| "learning_rate": 2.164416835455862e-05, | |
| "loss": 3.6207, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 13.572842597961426, | |
| "learning_rate": 2.1485403696324936e-05, | |
| "loss": 2.5546, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 7.186910629272461, | |
| "learning_rate": 2.1326783509379554e-05, | |
| "loss": 2.9678, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 50.014766693115234, | |
| "learning_rate": 2.11683143139771e-05, | |
| "loss": 3.1836, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.37592077255249, | |
| "learning_rate": 2.1010002624165527e-05, | |
| "loss": 2.659, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 22.413625717163086, | |
| "learning_rate": 2.0851854947518313e-05, | |
| "loss": 2.6929, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 11.021233558654785, | |
| "learning_rate": 2.069387778486703e-05, | |
| "loss": 2.7117, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 56.379844665527344, | |
| "learning_rate": 2.0536077630034086e-05, | |
| "loss": 3.4033, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 10.377793312072754, | |
| "learning_rate": 2.0378460969565782e-05, | |
| "loss": 2.4798, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 6.888087749481201, | |
| "learning_rate": 2.02210342824657e-05, | |
| "loss": 2.6521, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 6.764181613922119, | |
| "learning_rate": 2.0063804039928324e-05, | |
| "loss": 2.6824, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 55.408729553222656, | |
| "learning_rate": 1.9906776705073114e-05, | |
| "loss": 3.5755, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 8.327556610107422, | |
| "learning_rate": 1.9749958732678767e-05, | |
| "loss": 2.3398, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 9.360859870910645, | |
| "learning_rate": 1.9593356568917913e-05, | |
| "loss": 2.5018, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 32.50775146484375, | |
| "learning_rate": 1.9436976651092144e-05, | |
| "loss": 2.8295, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 23.467443466186523, | |
| "learning_rate": 1.928082540736737e-05, | |
| "loss": 2.6034, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 10.357468605041504, | |
| "learning_rate": 1.9124909256509622e-05, | |
| "loss": 3.0481, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 16.312400817871094, | |
| "learning_rate": 1.8969234607621186e-05, | |
| "loss": 2.9409, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 15.405855178833008, | |
| "learning_rate": 1.8813807859877147e-05, | |
| "loss": 2.6141, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 10.774744987487793, | |
| "learning_rate": 1.865863540226232e-05, | |
| "loss": 2.5919, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 7.001652717590332, | |
| "learning_rate": 1.8503723613308683e-05, | |
| "loss": 3.043, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 32.264766693115234, | |
| "learning_rate": 1.8349078860833123e-05, | |
| "loss": 2.711, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 15.870736122131348, | |
| "learning_rate": 1.8194707501675724e-05, | |
| "loss": 2.4717, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 15.47707748413086, | |
| "learning_rate": 1.8040615881438425e-05, | |
| "loss": 2.9706, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 12.465653419494629, | |
| "learning_rate": 1.7886810334224192e-05, | |
| "loss": 3.0024, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 20.99846839904785, | |
| "learning_rate": 1.7733297182376663e-05, | |
| "loss": 3.3546, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 37.41334533691406, | |
| "learning_rate": 1.7580082736220237e-05, | |
| "loss": 2.924, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 6.230709552764893, | |
| "learning_rate": 1.74271732938007e-05, | |
| "loss": 2.7253, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 15.803852081298828, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 2.8601, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 6.981109142303467, | |
| "learning_rate": 1.7122294549409484e-05, | |
| "loss": 2.6754, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 21.247787475585938, | |
| "learning_rate": 1.6970337779808862e-05, | |
| "loss": 2.5148, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 13.401627540588379, | |
| "learning_rate": 1.6818711078172077e-05, | |
| "loss": 2.5783, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 8.62679386138916, | |
| "learning_rate": 1.666742067727896e-05, | |
| "loss": 2.5611, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 6.567328929901123, | |
| "learning_rate": 1.6516472796085315e-05, | |
| "loss": 2.3179, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 9.707379341125488, | |
| "learning_rate": 1.6365873639467315e-05, | |
| "loss": 2.3441, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 38.05426788330078, | |
| "learning_rate": 1.621562939796643e-05, | |
| "loss": 2.562, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 18.448406219482422, | |
| "learning_rate": 1.6065746247534984e-05, | |
| "loss": 2.6616, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 12.998920440673828, | |
| "learning_rate": 1.5916230349282215e-05, | |
| "loss": 2.9305, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 7.341577053070068, | |
| "learning_rate": 1.5767087849221096e-05, | |
| "loss": 2.6997, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 11.726909637451172, | |
| "learning_rate": 1.561832487801565e-05, | |
| "loss": 2.9281, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 10.877985000610352, | |
| "learning_rate": 1.5469947550728958e-05, | |
| "loss": 3.7181, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 11.845093727111816, | |
| "learning_rate": 1.53219619665718e-05, | |
| "loss": 3.1277, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 11.064024925231934, | |
| "learning_rate": 1.5174374208651912e-05, | |
| "loss": 2.8159, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 12.611387252807617, | |
| "learning_rate": 1.502719034372396e-05, | |
| "loss": 2.9906, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 8.44251537322998, | |
| "learning_rate": 1.4880416421940155e-05, | |
| "loss": 2.7055, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 11.020173072814941, | |
| "learning_rate": 1.4734058476601553e-05, | |
| "loss": 2.7852, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 18.893342971801758, | |
| "learning_rate": 1.458812252391003e-05, | |
| "loss": 2.7677, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 9.38496208190918, | |
| "learning_rate": 1.444261456272101e-05, | |
| "loss": 2.7732, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 6.147873401641846, | |
| "learning_rate": 1.4297540574296869e-05, | |
| "loss": 2.2796, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 18.142108917236328, | |
| "learning_rate": 1.4152906522061048e-05, | |
| "loss": 3.1047, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 16.73578643798828, | |
| "learning_rate": 1.400871835135295e-05, | |
| "loss": 2.9558, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 8.667417526245117, | |
| "learning_rate": 1.386498198918352e-05, | |
| "loss": 2.5587, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 29.769210815429688, | |
| "learning_rate": 1.3721703343991633e-05, | |
| "loss": 3.2146, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 27.924272537231445, | |
| "learning_rate": 1.3578888305401207e-05, | |
| "loss": 2.8355, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 8.13482666015625, | |
| "learning_rate": 1.3436542743979125e-05, | |
| "loss": 2.8218, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 16.816307067871094, | |
| "learning_rate": 1.329467251099386e-05, | |
| "loss": 2.5636, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 10.0423583984375, | |
| "learning_rate": 1.3153283438175034e-05, | |
| "loss": 2.3591, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 10.223281860351562, | |
| "learning_rate": 1.3012381337473656e-05, | |
| "loss": 3.0024, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 16.49390983581543, | |
| "learning_rate": 1.2871972000823196e-05, | |
| "loss": 2.9023, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 7.545689582824707, | |
| "learning_rate": 1.2732061199901562e-05, | |
| "loss": 2.8308, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 16.10993003845215, | |
| "learning_rate": 1.2592654685893757e-05, | |
| "loss": 2.6319, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 17.161388397216797, | |
| "learning_rate": 1.2453758189255568e-05, | |
| "loss": 2.6535, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 9.898162841796875, | |
| "learning_rate": 1.231537741947795e-05, | |
| "loss": 3.3956, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 8.259682655334473, | |
| "learning_rate": 1.217751806485235e-05, | |
| "loss": 2.5027, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 13.10878849029541, | |
| "learning_rate": 1.2040185792236874e-05, | |
| "loss": 2.4352, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 5.2070722579956055, | |
| "learning_rate": 1.1903386246823361e-05, | |
| "loss": 2.8799, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 12.991480827331543, | |
| "learning_rate": 1.1767125051905315e-05, | |
| "loss": 3.0858, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 10.183439254760742, | |
| "learning_rate": 1.1631407808646758e-05, | |
| "loss": 2.767, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 44.539459228515625, | |
| "learning_rate": 1.1496240095852001e-05, | |
| "loss": 2.9168, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 9.825093269348145, | |
| "learning_rate": 1.1361627469736285e-05, | |
| "loss": 2.6493, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 13.887325286865234, | |
| "learning_rate": 1.122757546369744e-05, | |
| "loss": 2.8159, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 6.512905120849609, | |
| "learning_rate": 1.1094089588088383e-05, | |
| "loss": 3.4574, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 8.677343368530273, | |
| "learning_rate": 1.096117532999063e-05, | |
| "loss": 2.7936, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 29.443038940429688, | |
| "learning_rate": 1.082883815298876e-05, | |
| "loss": 2.7603, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 6.62708044052124, | |
| "learning_rate": 1.0697083496945765e-05, | |
| "loss": 2.6984, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 11.575033187866211, | |
| "learning_rate": 1.0565916777779519e-05, | |
| "loss": 2.7648, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 19.933650970458984, | |
| "learning_rate": 1.0435343387240098e-05, | |
| "loss": 2.9776, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 12.603243827819824, | |
| "learning_rate": 1.0305368692688174e-05, | |
| "loss": 2.4419, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 8.084098815917969, | |
| "learning_rate": 1.0175998036874356e-05, | |
| "loss": 2.7489, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 20.572294235229492, | |
| "learning_rate": 1.0047236737719601e-05, | |
| "loss": 3.0075, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 8.42055606842041, | |
| "learning_rate": 9.919090088096589e-06, | |
| "loss": 2.7706, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 33.90031051635742, | |
| "learning_rate": 9.791563355612172e-06, | |
| "loss": 3.6592, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 8.972926139831543, | |
| "learning_rate": 9.664661782390841e-06, | |
| "loss": 2.4741, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 34.49540328979492, | |
| "learning_rate": 9.538390584859214e-06, | |
| "loss": 2.8379, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 17.665390014648438, | |
| "learning_rate": 9.412754953531663e-06, | |
| "loss": 3.0774, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 24.203062057495117, | |
| "learning_rate": 9.287760052796909e-06, | |
| "loss": 2.6639, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 15.584200859069824, | |
| "learning_rate": 9.163411020705762e-06, | |
| "loss": 2.75, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 9.29335880279541, | |
| "learning_rate": 9.039712968759864e-06, | |
| "loss": 2.9486, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 10.088394165039062, | |
| "learning_rate": 8.916670981701655e-06, | |
| "loss": 2.251, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 12.336640357971191, | |
| "learning_rate": 8.794290117305296e-06, | |
| "loss": 2.9527, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 8.988898277282715, | |
| "learning_rate": 8.672575406168782e-06, | |
| "loss": 2.6327, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 11.5975980758667, | |
| "learning_rate": 8.551531851507186e-06, | |
| "loss": 2.6512, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 23.88221549987793, | |
| "learning_rate": 8.431164428946927e-06, | |
| "loss": 2.6542, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 8.243393898010254, | |
| "learning_rate": 8.3114780863213e-06, | |
| "loss": 3.0325, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 13.184893608093262, | |
| "learning_rate": 8.192477743467078e-06, | |
| "loss": 2.7061, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 7.766141414642334, | |
| "learning_rate": 8.07416829202227e-06, | |
| "loss": 2.2925, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 30.14096450805664, | |
| "learning_rate": 7.956554595225016e-06, | |
| "loss": 3.4068, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 16.042314529418945, | |
| "learning_rate": 7.839641487713745e-06, | |
| "loss": 2.388, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 16.486698150634766, | |
| "learning_rate": 7.723433775328384e-06, | |
| "loss": 3.549, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 10.384564399719238, | |
| "learning_rate": 7.607936234912841e-06, | |
| "loss": 3.186, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 13.724809646606445, | |
| "learning_rate": 7.493153614118634e-06, | |
| "loss": 2.6417, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 8.79148006439209, | |
| "learning_rate": 7.379090631209712e-06, | |
| "loss": 2.6296, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 14.533072471618652, | |
| "learning_rate": 7.265751974868554e-06, | |
| "loss": 2.5174, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 11.014676094055176, | |
| "learning_rate": 7.153142304003418e-06, | |
| "loss": 2.5473, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 14.28827953338623, | |
| "learning_rate": 7.041266247556813e-06, | |
| "loss": 2.8404, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 13.483999252319336, | |
| "learning_rate": 6.930128404315214e-06, | |
| "loss": 2.6145, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 22.0334529876709, | |
| "learning_rate": 6.819733342720066e-06, | |
| "loss": 2.7059, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 42.69429397583008, | |
| "learning_rate": 6.7100856006799665e-06, | |
| "loss": 3.0099, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 9.883504867553711, | |
| "learning_rate": 6.601189685384126e-06, | |
| "loss": 2.9242, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 17.86440658569336, | |
| "learning_rate": 6.493050073117116e-06, | |
| "loss": 3.022, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 27.365087509155273, | |
| "learning_rate": 6.385671209074828e-06, | |
| "loss": 2.9742, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 13.641215324401855, | |
| "learning_rate": 6.279057507181796e-06, | |
| "loss": 2.4529, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 8.5147123336792, | |
| "learning_rate": 6.173213349909729e-06, | |
| "loss": 2.5926, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 13.009632110595703, | |
| "learning_rate": 6.068143088097372e-06, | |
| "loss": 2.4284, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 11.745450973510742, | |
| "learning_rate": 5.9638510407716394e-06, | |
| "loss": 2.7295, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 7.102616310119629, | |
| "learning_rate": 5.860341494970131e-06, | |
| "loss": 2.3844, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 19.79552459716797, | |
| "learning_rate": 5.757618705564849e-06, | |
| "loss": 2.6637, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 10.288678169250488, | |
| "learning_rate": 5.655686895087329e-06, | |
| "loss": 2.6376, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 5.515965938568115, | |
| "learning_rate": 5.554550253555066e-06, | |
| "loss": 2.7672, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 22.45775604248047, | |
| "learning_rate": 5.454212938299255e-06, | |
| "loss": 2.9637, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 8.804770469665527, | |
| "learning_rate": 5.354679073793942e-06, | |
| "loss": 2.4708, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 14.247252464294434, | |
| "learning_rate": 5.255952751486443e-06, | |
| "loss": 2.6551, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 7.674140930175781, | |
| "learning_rate": 5.158038029629195e-06, | |
| "loss": 2.7633, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 9.93137264251709, | |
| "learning_rate": 5.060938933112891e-06, | |
| "loss": 2.7034, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 11.792275428771973, | |
| "learning_rate": 4.9646594533010875e-06, | |
| "loss": 2.8564, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 13.885851860046387, | |
| "learning_rate": 4.869203547866097e-06, | |
| "loss": 2.6809, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 21.813217163085938, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 3.0319, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 12.183605194091797, | |
| "learning_rate": 4.680778121384935e-06, | |
| "loss": 2.5685, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 26.5369930267334, | |
| "learning_rate": 4.587816345770032e-06, | |
| "loss": 2.9539, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 5.17782735824585, | |
| "learning_rate": 4.495693635076101e-06, | |
| "loss": 2.3644, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 21.22880744934082, | |
| "learning_rate": 4.404413776106958e-06, | |
| "loss": 3.0965, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 5.561256408691406, | |
| "learning_rate": 4.313980521020092e-06, | |
| "loss": 3.032, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 7.701003074645996, | |
| "learning_rate": 4.224397587172402e-06, | |
| "loss": 2.7757, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 17.345849990844727, | |
| "learning_rate": 4.135668656967434e-06, | |
| "loss": 2.3689, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 8.013896942138672, | |
| "learning_rate": 4.047797377703985e-06, | |
| "loss": 2.512, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 6.709656715393066, | |
| "learning_rate": 3.9607873614261715e-06, | |
| "loss": 3.0421, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 13.18295669555664, | |
| "learning_rate": 3.8746421847749765e-06, | |
| "loss": 2.4467, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 9.89919662475586, | |
| "learning_rate": 3.789365388841193e-06, | |
| "loss": 2.322, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 20.111215591430664, | |
| "learning_rate": 3.7049604790198976e-06, | |
| "loss": 3.2308, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 9.88414192199707, | |
| "learning_rate": 3.621430924866348e-06, | |
| "loss": 2.5939, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 8.166717529296875, | |
| "learning_rate": 3.5387801599533475e-06, | |
| "loss": 2.9439, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 31.1898193359375, | |
| "learning_rate": 3.4570115817301243e-06, | |
| "loss": 3.2292, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 26.66686248779297, | |
| "learning_rate": 3.3761285513826625e-06, | |
| "loss": 2.9332, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 4.934303283691406, | |
| "learning_rate": 3.296134393695538e-06, | |
| "loss": 2.7759, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 11.340413093566895, | |
| "learning_rate": 3.217032396915265e-06, | |
| "loss": 2.3886, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 10.123784065246582, | |
| "learning_rate": 3.1388258126151093e-06, | |
| "loss": 2.6871, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 9.508092880249023, | |
| "learning_rate": 3.06151785556143e-06, | |
| "loss": 3.0332, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 6.981659412384033, | |
| "learning_rate": 2.98511170358155e-06, | |
| "loss": 2.6556, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 7.004157066345215, | |
| "learning_rate": 2.9096104974331184e-06, | |
| "loss": 2.7863, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 15.677950859069824, | |
| "learning_rate": 2.8350173406749973e-06, | |
| "loss": 2.7697, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 7.333318710327148, | |
| "learning_rate": 2.7613352995397078e-06, | |
| "loss": 2.6726, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 5.832099437713623, | |
| "learning_rate": 2.688567402807357e-06, | |
| "loss": 2.8863, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 10.78770637512207, | |
| "learning_rate": 2.6167166416811746e-06, | |
| "loss": 2.8584, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 16.921499252319336, | |
| "learning_rate": 2.545785969664524e-06, | |
| "loss": 2.9597, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 8.275824546813965, | |
| "learning_rate": 2.475778302439524e-06, | |
| "loss": 2.1545, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 20.255998611450195, | |
| "learning_rate": 2.4066965177471645e-06, | |
| "loss": 2.5138, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 6.863336086273193, | |
| "learning_rate": 2.338543455269046e-06, | |
| "loss": 2.5736, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 10.660894393920898, | |
| "learning_rate": 2.271321916510627e-06, | |
| "loss": 2.5524, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 13.18586254119873, | |
| "learning_rate": 2.205034664686076e-06, | |
| "loss": 2.5716, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 11.016261100769043, | |
| "learning_rate": 2.1396844246046903e-06, | |
| "loss": 3.3971, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 23.928808212280273, | |
| "learning_rate": 2.075273882558873e-06, | |
| "loss": 2.7627, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 8.036683082580566, | |
| "learning_rate": 2.0118056862137357e-06, | |
| "loss": 2.7452, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 15.620336532592773, | |
| "learning_rate": 1.949282444498238e-06, | |
| "loss": 2.7892, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 5.829482078552246, | |
| "learning_rate": 1.8877067274979648e-06, | |
| "loss": 2.7427, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 24.735841751098633, | |
| "learning_rate": 1.827081066349459e-06, | |
| "loss": 2.3911, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 15.522488594055176, | |
| "learning_rate": 1.767407953136202e-06, | |
| "loss": 2.7896, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 10.789400100708008, | |
| "learning_rate": 1.7086898407861485e-06, | |
| "loss": 2.6058, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 12.961084365844727, | |
| "learning_rate": 1.6509291429709223e-06, | |
| "loss": 2.1545, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 42.19524002075195, | |
| "learning_rate": 1.59412823400657e-06, | |
| "loss": 2.7638, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 11.88717269897461, | |
| "learning_rate": 1.538289448755989e-06, | |
| "loss": 3.258, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 9.746241569519043, | |
| "learning_rate": 1.483415082532938e-06, | |
| "loss": 2.4654, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 8.284168243408203, | |
| "learning_rate": 1.4295073910076757e-06, | |
| "loss": 2.8934, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 6.290571212768555, | |
| "learning_rate": 1.3765685901142716e-06, | |
| "loss": 2.8615, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 9.758255004882812, | |
| "learning_rate": 1.3246008559594709e-06, | |
| "loss": 2.4915, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 64.08142852783203, | |
| "learning_rate": 1.273606324733284e-06, | |
| "loss": 2.6644, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 6.796569347381592, | |
| "learning_rate": 1.2235870926211619e-06, | |
| "loss": 3.0118, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 48.14067459106445, | |
| "learning_rate": 1.1745452157178206e-06, | |
| "loss": 2.5428, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 78.56604766845703, | |
| "learning_rate": 1.1264827099427417e-06, | |
| "loss": 3.2121, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 8.145052909851074, | |
| "learning_rate": 1.0794015509572818e-06, | |
| "loss": 2.7715, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 9.410415649414062, | |
| "learning_rate": 1.0333036740834856e-06, | |
| "loss": 2.9311, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 26.370338439941406, | |
| "learning_rate": 9.881909742245177e-07, | |
| "loss": 2.8867, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 19.1849365234375, | |
| "learning_rate": 9.440653057867815e-07, | |
| "loss": 2.559, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 25.568355560302734, | |
| "learning_rate": 9.009284826036691e-07, | |
| "loss": 2.5787, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 14.307110786437988, | |
| "learning_rate": 8.587822778610283e-07, | |
| "loss": 2.8058, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 25.297014236450195, | |
| "learning_rate": 8.176284240242638e-07, | |
| "loss": 2.8921, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 19.239059448242188, | |
| "learning_rate": 7.774686127671183e-07, | |
| "loss": 2.6122, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 11.51448917388916, | |
| "learning_rate": 7.383044949021339e-07, | |
| "loss": 2.2361, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 6.253133773803711, | |
| "learning_rate": 7.00137680312804e-07, | |
| "loss": 2.5475, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 17.918170928955078, | |
| "learning_rate": 6.62969737887384e-07, | |
| "loss": 2.8589, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 15.065885543823242, | |
| "learning_rate": 6.268021954544096e-07, | |
| "loss": 2.9153, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 13.842984199523926, | |
| "learning_rate": 5.916365397198975e-07, | |
| "loss": 2.894, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 10.621698379516602, | |
| "learning_rate": 5.574742162062163e-07, | |
| "loss": 3.1294, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 13.219593048095703, | |
| "learning_rate": 5.243166291926782e-07, | |
| "loss": 2.6543, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 11.599324226379395, | |
| "learning_rate": 4.921651416578188e-07, | |
| "loss": 2.3677, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 8.731278419494629, | |
| "learning_rate": 4.6102107522336403e-07, | |
| "loss": 2.7634, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 43.908077239990234, | |
| "learning_rate": 4.308857100999042e-07, | |
| "loss": 2.8106, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 9.233963966369629, | |
| "learning_rate": 4.0176028503425835e-07, | |
| "loss": 2.5252, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 13.496291160583496, | |
| "learning_rate": 3.7364599725858153e-07, | |
| "loss": 2.2908, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 11.278969764709473, | |
| "learning_rate": 3.465440024411265e-07, | |
| "loss": 2.5045, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 13.121251106262207, | |
| "learning_rate": 3.204554146387456e-07, | |
| "loss": 2.4661, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 17.475078582763672, | |
| "learning_rate": 2.9538130625110796e-07, | |
| "loss": 2.9907, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 6.753970146179199, | |
| "learning_rate": 2.7132270797659563e-07, | |
| "loss": 2.4326, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 18.052387237548828, | |
| "learning_rate": 2.482806087699546e-07, | |
| "loss": 3.0056, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 11.094266891479492, | |
| "learning_rate": 2.262559558016325e-07, | |
| "loss": 2.7058, | |
| "step": 2400 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "total_flos": 1.3202552777146368e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |