{ "best_global_step": null, "best_metric": 2.056363105773926, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2022, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001483679525222552, "grad_norm": 8.939897537231445, "learning_rate": 0.0, "loss": 0.8632, "step": 1 }, { "epoch": 0.004451038575667656, "grad_norm": 9.648813247680664, "learning_rate": 2.8314346251117696e-06, "loss": 0.801, "step": 3 }, { "epoch": 0.008902077151335312, "grad_norm": 6.610266208648682, "learning_rate": 4.61787097538723e-06, "loss": 0.8504, "step": 6 }, { "epoch": 0.013353115727002967, "grad_norm": 6.382794380187988, "learning_rate": 5.662869250223539e-06, "loss": 0.8868, "step": 9 }, { "epoch": 0.017804154302670624, "grad_norm": 11.73421859741211, "learning_rate": 6.40430732566269e-06, "loss": 0.7982, "step": 12 }, { "epoch": 0.02225519287833828, "grad_norm": 9.07333755493164, "learning_rate": 6.979411376544402e-06, "loss": 0.7539, "step": 15 }, { "epoch": 0.026706231454005934, "grad_norm": 5.257121562957764, "learning_rate": 7.449305600498998e-06, "loss": 0.7841, "step": 18 }, { "epoch": 0.03115727002967359, "grad_norm": 10.231422424316406, "learning_rate": 7.846595506000205e-06, "loss": 0.773, "step": 21 }, { "epoch": 0.03560830860534125, "grad_norm": 6.98926305770874, "learning_rate": 8.190743675938152e-06, "loss": 0.794, "step": 24 }, { "epoch": 0.040059347181008904, "grad_norm": 11.664831161499023, "learning_rate": 8.494303875335309e-06, "loss": 0.7717, "step": 27 }, { "epoch": 0.04451038575667656, "grad_norm": 7.91517448425293, "learning_rate": 8.765847726819862e-06, "loss": 0.7704, "step": 30 }, { "epoch": 0.04896142433234421, "grad_norm": 4.517396926879883, "learning_rate": 9.01148901993771e-06, "loss": 0.7508, "step": 33 }, { "epoch": 0.05341246290801187, "grad_norm": 6.36966609954834, "learning_rate": 9.23574195077446e-06, "loss": 0.8119, "step": 36 }, { "epoch": 0.057863501483679525, "grad_norm": 11.161980628967285, "learning_rate": 9.442034649602095e-06, "loss": 0.8007, "step": 39 }, { "epoch": 0.06231454005934718, "grad_norm": 8.65380859375, "learning_rate": 9.633031856275666e-06, "loss": 0.7474, "step": 42 }, { "epoch": 0.06676557863501484, "grad_norm": 8.394684791564941, "learning_rate": 9.81084600165617e-06, "loss": 0.7685, "step": 45 }, { "epoch": 0.0712166172106825, "grad_norm": 9.330730438232422, "learning_rate": 9.977180026213612e-06, "loss": 0.7716, "step": 48 }, { "epoch": 0.07566765578635015, "grad_norm": 6.538848400115967, "learning_rate": 1.013342682512159e-05, "loss": 0.8483, "step": 51 }, { "epoch": 0.08011869436201781, "grad_norm": 5.0795793533325195, "learning_rate": 1.0280740225610769e-05, "loss": 0.8217, "step": 54 }, { "epoch": 0.08456973293768547, "grad_norm": 4.957094192504883, "learning_rate": 1.042008674846264e-05, "loss": 0.8032, "step": 57 }, { "epoch": 0.08902077151335312, "grad_norm": 11.510953903198242, "learning_rate": 1.0552284077095323e-05, "loss": 0.8015, "step": 60 }, { "epoch": 0.09347181008902077, "grad_norm": 5.8856096267700195, "learning_rate": 1.0678030131111975e-05, "loss": 0.7644, "step": 63 }, { "epoch": 0.09792284866468842, "grad_norm": 6.385069847106934, "learning_rate": 1.0797925370213172e-05, "loss": 0.782, "step": 66 }, { "epoch": 0.10237388724035608, "grad_norm": 7.3922295570373535, "learning_rate": 1.0912490136135183e-05, "loss": 0.8079, "step": 69 }, { "epoch": 0.10682492581602374, "grad_norm": 6.742562770843506, "learning_rate": 1.102217830104992e-05, "loss": 0.7727, "step": 72 }, { "epoch": 0.11127596439169139, "grad_norm": 10.546209335327148, "learning_rate": 1.1127388127977034e-05, "loss": 0.8124, "step": 75 }, { "epoch": 0.11572700296735905, "grad_norm": 10.506328582763672, "learning_rate": 1.1228470999877556e-05, "loss": 0.7427, "step": 78 }, { "epoch": 0.1201780415430267, "grad_norm": 8.066429138183594, "learning_rate": 1.1325738500447079e-05, "loss": 0.7873, "step": 81 }, { "epoch": 0.12462908011869436, "grad_norm": 9.615484237670898, "learning_rate": 1.1419468206551126e-05, "loss": 0.8364, "step": 84 }, { "epoch": 0.129080118694362, "grad_norm": 14.365260124206543, "learning_rate": 1.150990846375502e-05, "loss": 0.7708, "step": 87 }, { "epoch": 0.13353115727002968, "grad_norm": 8.187057495117188, "learning_rate": 1.1597282351931633e-05, "loss": 0.7984, "step": 90 }, { "epoch": 0.13798219584569732, "grad_norm": 14.292377471923828, "learning_rate": 1.1681791000387452e-05, "loss": 0.7983, "step": 93 }, { "epoch": 0.142433234421365, "grad_norm": 12.673680305480957, "learning_rate": 1.1763616376489071e-05, "loss": 0.8131, "step": 96 }, { "epoch": 0.14688427299703263, "grad_norm": 5.767789363861084, "learning_rate": 1.1842923645049482e-05, "loss": 0.8275, "step": 99 }, { "epoch": 0.1513353115727003, "grad_norm": 10.027091026306152, "learning_rate": 1.1919863175397048e-05, "loss": 0.7671, "step": 102 }, { "epoch": 0.15578635014836795, "grad_norm": 5.067603588104248, "learning_rate": 1.1994572257432837e-05, "loss": 0.7883, "step": 105 }, { "epoch": 0.16023738872403562, "grad_norm": 12.904479026794434, "learning_rate": 1.206717657588623e-05, "loss": 0.7876, "step": 108 }, { "epoch": 0.16468842729970326, "grad_norm": 8.401494979858398, "learning_rate": 1.2137791482536164e-05, "loss": 0.8008, "step": 111 }, { "epoch": 0.16913946587537093, "grad_norm": 11.146966934204102, "learning_rate": 1.2206523098738102e-05, "loss": 0.7833, "step": 114 }, { "epoch": 0.17359050445103857, "grad_norm": 9.299595832824707, "learning_rate": 1.2273469274713866e-05, "loss": 0.7752, "step": 117 }, { "epoch": 0.17804154302670624, "grad_norm": 9.26494312286377, "learning_rate": 1.2338720427370783e-05, "loss": 0.8126, "step": 120 }, { "epoch": 0.1824925816023739, "grad_norm": 9.886981964111328, "learning_rate": 1.2402360274652675e-05, "loss": 0.7478, "step": 123 }, { "epoch": 0.18694362017804153, "grad_norm": 11.463534355163574, "learning_rate": 1.2464466481387436e-05, "loss": 0.7334, "step": 126 }, { "epoch": 0.1913946587537092, "grad_norm": 7.69536018371582, "learning_rate": 1.252511122913015e-05, "loss": 0.7756, "step": 129 }, { "epoch": 0.19584569732937684, "grad_norm": 9.41036605834961, "learning_rate": 1.2584361720488632e-05, "loss": 0.7992, "step": 132 }, { "epoch": 0.20029673590504452, "grad_norm": 6.2021803855896, "learning_rate": 1.264228062676794e-05, "loss": 0.7504, "step": 135 }, { "epoch": 0.20474777448071216, "grad_norm": 6.035690784454346, "learning_rate": 1.2698926486410644e-05, "loss": 0.8319, "step": 138 }, { "epoch": 0.20919881305637983, "grad_norm": 6.217399597167969, "learning_rate": 1.275435406058353e-05, "loss": 0.8151, "step": 141 }, { "epoch": 0.21364985163204747, "grad_norm": 10.224953651428223, "learning_rate": 1.280861465132538e-05, "loss": 0.8005, "step": 144 }, { "epoch": 0.21810089020771514, "grad_norm": 5.470182418823242, "learning_rate": 1.286175638688864e-05, "loss": 0.8147, "step": 147 }, { "epoch": 0.22255192878338279, "grad_norm": 6.979781627655029, "learning_rate": 1.2913824478252495e-05, "loss": 0.7623, "step": 150 }, { "epoch": 0.22700296735905046, "grad_norm": 7.8118414878845215, "learning_rate": 1.2964861450233358e-05, "loss": 0.7851, "step": 153 }, { "epoch": 0.2314540059347181, "grad_norm": 8.010553359985352, "learning_rate": 1.3014907350153016e-05, "loss": 0.8243, "step": 156 }, { "epoch": 0.23590504451038577, "grad_norm": 6.6702470779418945, "learning_rate": 1.3063999936629808e-05, "loss": 0.8348, "step": 159 }, { "epoch": 0.2403560830860534, "grad_norm": 6.347877025604248, "learning_rate": 1.3112174850722537e-05, "loss": 0.8093, "step": 162 }, { "epoch": 0.24480712166172106, "grad_norm": 8.773872375488281, "learning_rate": 1.3159465771370344e-05, "loss": 0.8424, "step": 165 }, { "epoch": 0.24925816023738873, "grad_norm": 9.952154159545898, "learning_rate": 1.3205904556826587e-05, "loss": 0.8068, "step": 168 }, { "epoch": 0.25370919881305637, "grad_norm": 4.658379554748535, "learning_rate": 1.325152137357441e-05, "loss": 0.8352, "step": 171 }, { "epoch": 0.258160237388724, "grad_norm": 8.045439720153809, "learning_rate": 1.329634481403048e-05, "loss": 0.7586, "step": 174 }, { "epoch": 0.2626112759643917, "grad_norm": 13.365344047546387, "learning_rate": 1.3340402004187042e-05, "loss": 0.7838, "step": 177 }, { "epoch": 0.26706231454005935, "grad_norm": 10.028172492980957, "learning_rate": 1.3383718702207093e-05, "loss": 0.8142, "step": 180 }, { "epoch": 0.271513353115727, "grad_norm": 7.544151782989502, "learning_rate": 1.3426319388870015e-05, "loss": 0.8386, "step": 183 }, { "epoch": 0.27596439169139464, "grad_norm": 11.747024536132812, "learning_rate": 1.3468227350662914e-05, "loss": 0.7884, "step": 186 }, { "epoch": 0.28041543026706234, "grad_norm": 13.382097244262695, "learning_rate": 1.3509464756223744e-05, "loss": 0.7878, "step": 189 }, { "epoch": 0.28486646884273, "grad_norm": 6.687026023864746, "learning_rate": 1.3550052726764533e-05, "loss": 0.7692, "step": 192 }, { "epoch": 0.2893175074183976, "grad_norm": 5.648706912994385, "learning_rate": 1.3590011401034729e-05, "loss": 0.8231, "step": 195 }, { "epoch": 0.29376854599406527, "grad_norm": 12.061844825744629, "learning_rate": 1.3629359995324941e-05, "loss": 0.7478, "step": 198 }, { "epoch": 0.29821958456973297, "grad_norm": 7.498600006103516, "learning_rate": 1.3668116858958576e-05, "loss": 0.7725, "step": 201 }, { "epoch": 0.3026706231454006, "grad_norm": 9.396763801574707, "learning_rate": 1.370629952567251e-05, "loss": 0.7833, "step": 204 }, { "epoch": 0.30712166172106825, "grad_norm": 8.891100883483887, "learning_rate": 1.3743924761246951e-05, "loss": 0.8031, "step": 207 }, { "epoch": 0.3115727002967359, "grad_norm": 4.5692830085754395, "learning_rate": 1.3781008607708299e-05, "loss": 0.7857, "step": 210 }, { "epoch": 0.31602373887240354, "grad_norm": 6.108200550079346, "learning_rate": 1.381756642439674e-05, "loss": 0.7977, "step": 213 }, { "epoch": 0.32047477744807124, "grad_norm": 11.626425743103027, "learning_rate": 1.385361292616169e-05, "loss": 0.7656, "step": 216 }, { "epoch": 0.3249258160237389, "grad_norm": 7.599254131317139, "learning_rate": 1.38891622189228e-05, "loss": 0.8164, "step": 219 }, { "epoch": 0.3293768545994065, "grad_norm": 13.067089080810547, "learning_rate": 1.3924227832811623e-05, "loss": 0.8302, "step": 222 }, { "epoch": 0.33382789317507416, "grad_norm": 8.126757621765137, "learning_rate": 1.3958822753088804e-05, "loss": 0.772, "step": 225 }, { "epoch": 0.33827893175074186, "grad_norm": 4.585634231567383, "learning_rate": 1.3992959449013562e-05, "loss": 0.8119, "step": 228 }, { "epoch": 0.3427299703264095, "grad_norm": 7.228199481964111, "learning_rate": 1.4026649900826146e-05, "loss": 0.7462, "step": 231 }, { "epoch": 0.34718100890207715, "grad_norm": 8.683415412902832, "learning_rate": 1.4059905624989326e-05, "loss": 0.7746, "step": 234 }, { "epoch": 0.3516320474777448, "grad_norm": 7.1446990966796875, "learning_rate": 1.4092737697821986e-05, "loss": 0.9112, "step": 237 }, { "epoch": 0.3560830860534125, "grad_norm": 7.082102298736572, "learning_rate": 1.4125156777646244e-05, "loss": 0.7838, "step": 240 }, { "epoch": 0.36053412462908013, "grad_norm": 4.828745365142822, "learning_rate": 1.4157173125558845e-05, "loss": 0.806, "step": 243 }, { "epoch": 0.3649851632047478, "grad_norm": 9.047410011291504, "learning_rate": 1.4188796624928136e-05, "loss": 0.8594, "step": 246 }, { "epoch": 0.3694362017804154, "grad_norm": 4.606753349304199, "learning_rate": 1.4220036799709316e-05, "loss": 0.7615, "step": 249 }, { "epoch": 0.37388724035608306, "grad_norm": 6.191976547241211, "learning_rate": 1.4250902831662896e-05, "loss": 0.8045, "step": 252 }, { "epoch": 0.37833827893175076, "grad_norm": 5.661036491394043, "learning_rate": 1.4281403576554221e-05, "loss": 0.8259, "step": 255 }, { "epoch": 0.3827893175074184, "grad_norm": 11.667348861694336, "learning_rate": 1.4311547579405614e-05, "loss": 0.8555, "step": 258 }, { "epoch": 0.38724035608308605, "grad_norm": 7.464338302612305, "learning_rate": 1.4341343088866789e-05, "loss": 0.8525, "step": 261 }, { "epoch": 0.3916913946587537, "grad_norm": 4.3380584716796875, "learning_rate": 1.4370798070764093e-05, "loss": 0.7874, "step": 264 }, { "epoch": 0.3961424332344214, "grad_norm": 9.564691543579102, "learning_rate": 1.4399920220884169e-05, "loss": 0.8047, "step": 267 }, { "epoch": 0.40059347181008903, "grad_norm": 8.386067390441895, "learning_rate": 1.4428716977043402e-05, "loss": 0.8484, "step": 270 }, { "epoch": 0.4050445103857567, "grad_norm": 6.571981430053711, "learning_rate": 1.4457195530490532e-05, "loss": 0.8236, "step": 273 }, { "epoch": 0.4094955489614243, "grad_norm": 4.435903549194336, "learning_rate": 1.4485362836686102e-05, "loss": 0.8457, "step": 276 }, { "epoch": 0.413946587537092, "grad_norm": 6.691802024841309, "learning_rate": 1.451322562549922e-05, "loss": 0.8016, "step": 279 }, { "epoch": 0.41839762611275966, "grad_norm": 15.839409828186035, "learning_rate": 1.454079041085899e-05, "loss": 0.845, "step": 282 }, { "epoch": 0.4228486646884273, "grad_norm": 7.598920822143555, "learning_rate": 1.4568063499895273e-05, "loss": 0.7626, "step": 285 }, { "epoch": 0.42729970326409494, "grad_norm": 10.305091857910156, "learning_rate": 1.4595051001600841e-05, "loss": 0.7926, "step": 288 }, { "epoch": 0.4317507418397626, "grad_norm": 5.350500106811523, "learning_rate": 1.4621758835044685e-05, "loss": 0.8092, "step": 291 }, { "epoch": 0.4362017804154303, "grad_norm": 18.546098709106445, "learning_rate": 1.4648192737164102e-05, "loss": 0.815, "step": 294 }, { "epoch": 0.4406528189910979, "grad_norm": 8.379362106323242, "learning_rate": 1.4674358270161251e-05, "loss": 0.823, "step": 297 }, { "epoch": 0.44510385756676557, "grad_norm": 8.076055526733398, "learning_rate": 1.4700260828527957e-05, "loss": 0.8321, "step": 300 }, { "epoch": 0.4495548961424332, "grad_norm": 16.41190528869629, "learning_rate": 1.4725905645721047e-05, "loss": 0.8191, "step": 303 }, { "epoch": 0.4540059347181009, "grad_norm": 6.45059871673584, "learning_rate": 1.475129780050882e-05, "loss": 0.7658, "step": 306 }, { "epoch": 0.45845697329376855, "grad_norm": 10.349370002746582, "learning_rate": 1.4776442223007901e-05, "loss": 0.7575, "step": 309 }, { "epoch": 0.4629080118694362, "grad_norm": 3.693134069442749, "learning_rate": 1.4801343700428479e-05, "loss": 0.8456, "step": 312 }, { "epoch": 0.46735905044510384, "grad_norm": 9.323659896850586, "learning_rate": 1.4826006882544607e-05, "loss": 0.8387, "step": 315 }, { "epoch": 0.47181008902077154, "grad_norm": 7.65170431137085, "learning_rate": 1.4850436286905268e-05, "loss": 0.8939, "step": 318 }, { "epoch": 0.4762611275964392, "grad_norm": 15.972323417663574, "learning_rate": 1.4874636303800742e-05, "loss": 0.8639, "step": 321 }, { "epoch": 0.4807121661721068, "grad_norm": 22.23072052001953, "learning_rate": 1.4898611200997996e-05, "loss": 0.8339, "step": 324 }, { "epoch": 0.48516320474777447, "grad_norm": 6.0545430183410645, "learning_rate": 1.4922365128257845e-05, "loss": 0.8315, "step": 327 }, { "epoch": 0.4896142433234421, "grad_norm": 15.703563690185547, "learning_rate": 1.4945902121645804e-05, "loss": 0.8343, "step": 330 }, { "epoch": 0.4940652818991098, "grad_norm": 9.006149291992188, "learning_rate": 1.4969226107647933e-05, "loss": 0.9232, "step": 333 }, { "epoch": 0.49851632047477745, "grad_norm": 9.227169036865234, "learning_rate": 1.4992340907102047e-05, "loss": 0.8832, "step": 336 }, { "epoch": 0.5029673590504451, "grad_norm": 13.163078308105469, "learning_rate": 1.5e-05, "loss": 0.8781, "step": 339 }, { "epoch": 0.5074183976261127, "grad_norm": 7.369640350341797, "learning_rate": 1.5e-05, "loss": 0.8778, "step": 342 }, { "epoch": 0.5118694362017804, "grad_norm": 11.006775856018066, "learning_rate": 1.5e-05, "loss": 0.8643, "step": 345 }, { "epoch": 0.516320474777448, "grad_norm": 3.54710054397583, "learning_rate": 1.5e-05, "loss": 0.843, "step": 348 }, { "epoch": 0.5207715133531158, "grad_norm": 9.564830780029297, "learning_rate": 1.5e-05, "loss": 0.8245, "step": 351 }, { "epoch": 0.5252225519287834, "grad_norm": 8.431432723999023, "learning_rate": 1.5e-05, "loss": 0.886, "step": 354 }, { "epoch": 0.5296735905044511, "grad_norm": 7.211839199066162, "learning_rate": 1.5e-05, "loss": 0.8578, "step": 357 }, { "epoch": 0.5341246290801187, "grad_norm": 7.788987159729004, "learning_rate": 1.5e-05, "loss": 0.872, "step": 360 }, { "epoch": 0.5385756676557863, "grad_norm": 4.075163841247559, "learning_rate": 1.5e-05, "loss": 0.8802, "step": 363 }, { "epoch": 0.543026706231454, "grad_norm": 13.805707931518555, "learning_rate": 1.5e-05, "loss": 0.8325, "step": 366 }, { "epoch": 0.5474777448071216, "grad_norm": 7.167026519775391, "learning_rate": 1.5e-05, "loss": 0.9119, "step": 369 }, { "epoch": 0.5519287833827893, "grad_norm": 8.409590721130371, "learning_rate": 1.5e-05, "loss": 0.8572, "step": 372 }, { "epoch": 0.5563798219584569, "grad_norm": 11.929038047790527, "learning_rate": 1.5e-05, "loss": 0.8327, "step": 375 }, { "epoch": 0.5608308605341247, "grad_norm": 4.433465480804443, "learning_rate": 1.5e-05, "loss": 0.7959, "step": 378 }, { "epoch": 0.5652818991097923, "grad_norm": 7.223580360412598, "learning_rate": 1.5e-05, "loss": 0.8157, "step": 381 }, { "epoch": 0.56973293768546, "grad_norm": 6.028378009796143, "learning_rate": 1.5e-05, "loss": 0.8469, "step": 384 }, { "epoch": 0.5741839762611276, "grad_norm": 10.804591178894043, "learning_rate": 1.5e-05, "loss": 0.8891, "step": 387 }, { "epoch": 0.5786350148367952, "grad_norm": 10.0234956741333, "learning_rate": 1.5e-05, "loss": 0.906, "step": 390 }, { "epoch": 0.5830860534124629, "grad_norm": 4.883424758911133, "learning_rate": 1.5e-05, "loss": 0.8422, "step": 393 }, { "epoch": 0.5875370919881305, "grad_norm": 8.042715072631836, "learning_rate": 1.5e-05, "loss": 0.8673, "step": 396 }, { "epoch": 0.5919881305637982, "grad_norm": 6.206501007080078, "learning_rate": 1.5e-05, "loss": 0.8115, "step": 399 }, { "epoch": 0.5964391691394659, "grad_norm": 14.539153099060059, "learning_rate": 1.5e-05, "loss": 0.8752, "step": 402 }, { "epoch": 0.6008902077151336, "grad_norm": 9.136768341064453, "learning_rate": 1.5e-05, "loss": 0.8752, "step": 405 }, { "epoch": 0.6053412462908012, "grad_norm": 4.936409950256348, "learning_rate": 1.5e-05, "loss": 0.8807, "step": 408 }, { "epoch": 0.6097922848664689, "grad_norm": 12.717706680297852, "learning_rate": 1.5e-05, "loss": 0.8133, "step": 411 }, { "epoch": 0.6142433234421365, "grad_norm": 13.034161567687988, "learning_rate": 1.5e-05, "loss": 0.8437, "step": 414 }, { "epoch": 0.6186943620178041, "grad_norm": 4.76663064956665, "learning_rate": 1.5e-05, "loss": 0.8141, "step": 417 }, { "epoch": 0.6231454005934718, "grad_norm": 6.521324634552002, "learning_rate": 1.5e-05, "loss": 0.8687, "step": 420 }, { "epoch": 0.6275964391691394, "grad_norm": 19.489913940429688, "learning_rate": 1.5e-05, "loss": 0.8154, "step": 423 }, { "epoch": 0.6320474777448071, "grad_norm": 13.211241722106934, "learning_rate": 1.5e-05, "loss": 0.8417, "step": 426 }, { "epoch": 0.6364985163204748, "grad_norm": 8.362677574157715, "learning_rate": 1.5e-05, "loss": 0.8387, "step": 429 }, { "epoch": 0.6409495548961425, "grad_norm": 13.372685432434082, "learning_rate": 1.5e-05, "loss": 0.8563, "step": 432 }, { "epoch": 0.6454005934718101, "grad_norm": 4.561835765838623, "learning_rate": 1.5e-05, "loss": 0.8571, "step": 435 }, { "epoch": 0.6498516320474778, "grad_norm": 12.67446231842041, "learning_rate": 1.5e-05, "loss": 0.8822, "step": 438 }, { "epoch": 0.6543026706231454, "grad_norm": 11.653807640075684, "learning_rate": 1.5e-05, "loss": 0.8925, "step": 441 }, { "epoch": 0.658753709198813, "grad_norm": 9.325271606445312, "learning_rate": 1.5e-05, "loss": 0.7984, "step": 444 }, { "epoch": 0.6632047477744807, "grad_norm": 5.574127674102783, "learning_rate": 1.5e-05, "loss": 0.8253, "step": 447 }, { "epoch": 0.6676557863501483, "grad_norm": 5.725862979888916, "learning_rate": 1.5e-05, "loss": 0.9019, "step": 450 }, { "epoch": 0.672106824925816, "grad_norm": 8.04867172241211, "learning_rate": 1.5e-05, "loss": 0.8331, "step": 453 }, { "epoch": 0.6765578635014837, "grad_norm": 8.826385498046875, "learning_rate": 1.5e-05, "loss": 0.8056, "step": 456 }, { "epoch": 0.6810089020771514, "grad_norm": 7.501665115356445, "learning_rate": 1.5e-05, "loss": 0.8137, "step": 459 }, { "epoch": 0.685459940652819, "grad_norm": 3.8520801067352295, "learning_rate": 1.5e-05, "loss": 0.8345, "step": 462 }, { "epoch": 0.6899109792284867, "grad_norm": 15.48876953125, "learning_rate": 1.5e-05, "loss": 0.859, "step": 465 }, { "epoch": 0.6943620178041543, "grad_norm": 12.54112720489502, "learning_rate": 1.5e-05, "loss": 0.8799, "step": 468 }, { "epoch": 0.6988130563798219, "grad_norm": 7.668098449707031, "learning_rate": 1.5e-05, "loss": 0.8135, "step": 471 }, { "epoch": 0.7032640949554896, "grad_norm": 13.388195991516113, "learning_rate": 1.5e-05, "loss": 0.8104, "step": 474 }, { "epoch": 0.7077151335311572, "grad_norm": 8.277421951293945, "learning_rate": 1.5e-05, "loss": 0.792, "step": 477 }, { "epoch": 0.712166172106825, "grad_norm": 10.426804542541504, "learning_rate": 1.5e-05, "loss": 0.8533, "step": 480 }, { "epoch": 0.7166172106824926, "grad_norm": 15.068408012390137, "learning_rate": 1.5e-05, "loss": 0.8912, "step": 483 }, { "epoch": 0.7210682492581603, "grad_norm": 7.566452980041504, "learning_rate": 1.5e-05, "loss": 0.8625, "step": 486 }, { "epoch": 0.7255192878338279, "grad_norm": 14.359679222106934, "learning_rate": 1.5e-05, "loss": 0.7973, "step": 489 }, { "epoch": 0.7299703264094956, "grad_norm": 8.746999740600586, "learning_rate": 1.5e-05, "loss": 0.9004, "step": 492 }, { "epoch": 0.7344213649851632, "grad_norm": 15.901468276977539, "learning_rate": 1.5e-05, "loss": 0.8499, "step": 495 }, { "epoch": 0.7388724035608308, "grad_norm": 5.345223903656006, "learning_rate": 1.5e-05, "loss": 0.9112, "step": 498 }, { "epoch": 0.7433234421364985, "grad_norm": 8.00938892364502, "learning_rate": 1.5e-05, "loss": 0.8521, "step": 501 }, { "epoch": 0.7477744807121661, "grad_norm": 7.601090431213379, "learning_rate": 1.5e-05, "loss": 0.8708, "step": 504 }, { "epoch": 0.7522255192878339, "grad_norm": 14.48643970489502, "learning_rate": 1.5e-05, "loss": 0.8476, "step": 507 }, { "epoch": 0.7566765578635015, "grad_norm": 16.052143096923828, "learning_rate": 1.5e-05, "loss": 0.8846, "step": 510 }, { "epoch": 0.7611275964391692, "grad_norm": 11.447772979736328, "learning_rate": 1.5e-05, "loss": 0.8062, "step": 513 }, { "epoch": 0.7655786350148368, "grad_norm": 7.909060955047607, "learning_rate": 1.5e-05, "loss": 0.8715, "step": 516 }, { "epoch": 0.7700296735905044, "grad_norm": 10.86801528930664, "learning_rate": 1.5e-05, "loss": 0.8102, "step": 519 }, { "epoch": 0.7744807121661721, "grad_norm": 6.530400276184082, "learning_rate": 1.5e-05, "loss": 0.8667, "step": 522 }, { "epoch": 0.7789317507418397, "grad_norm": 13.691858291625977, "learning_rate": 1.5e-05, "loss": 0.8159, "step": 525 }, { "epoch": 0.7833827893175074, "grad_norm": 8.416064262390137, "learning_rate": 1.5e-05, "loss": 0.8298, "step": 528 }, { "epoch": 0.787833827893175, "grad_norm": 8.614116668701172, "learning_rate": 1.5e-05, "loss": 0.8542, "step": 531 }, { "epoch": 0.7922848664688428, "grad_norm": 10.818787574768066, "learning_rate": 1.5e-05, "loss": 0.8476, "step": 534 }, { "epoch": 0.7967359050445104, "grad_norm": 12.394455909729004, "learning_rate": 1.5e-05, "loss": 0.884, "step": 537 }, { "epoch": 0.8011869436201781, "grad_norm": 7.285090446472168, "learning_rate": 1.5e-05, "loss": 0.8706, "step": 540 }, { "epoch": 0.8056379821958457, "grad_norm": 6.0529632568359375, "learning_rate": 1.5e-05, "loss": 0.8389, "step": 543 }, { "epoch": 0.8100890207715133, "grad_norm": 5.722935199737549, "learning_rate": 1.5e-05, "loss": 0.9546, "step": 546 }, { "epoch": 0.814540059347181, "grad_norm": 8.289714813232422, "learning_rate": 1.5e-05, "loss": 0.8797, "step": 549 }, { "epoch": 0.8189910979228486, "grad_norm": 11.81054973602295, "learning_rate": 1.5e-05, "loss": 0.9221, "step": 552 }, { "epoch": 0.8234421364985163, "grad_norm": 8.692460060119629, "learning_rate": 1.5e-05, "loss": 0.8738, "step": 555 }, { "epoch": 0.827893175074184, "grad_norm": 12.485095024108887, "learning_rate": 1.5e-05, "loss": 0.824, "step": 558 }, { "epoch": 0.8323442136498517, "grad_norm": 15.875768661499023, "learning_rate": 1.5e-05, "loss": 0.8862, "step": 561 }, { "epoch": 0.8367952522255193, "grad_norm": 11.125205039978027, "learning_rate": 1.5e-05, "loss": 0.8969, "step": 564 }, { "epoch": 0.841246290801187, "grad_norm": 9.451800346374512, "learning_rate": 1.5e-05, "loss": 0.8265, "step": 567 }, { "epoch": 0.8456973293768546, "grad_norm": 8.435980796813965, "learning_rate": 1.5e-05, "loss": 0.9067, "step": 570 }, { "epoch": 0.8501483679525222, "grad_norm": 7.994401931762695, "learning_rate": 1.5e-05, "loss": 0.8713, "step": 573 }, { "epoch": 0.8545994065281899, "grad_norm": 8.249290466308594, "learning_rate": 1.5e-05, "loss": 0.8681, "step": 576 }, { "epoch": 0.8590504451038575, "grad_norm": 10.910624504089355, "learning_rate": 1.5e-05, "loss": 0.8686, "step": 579 }, { "epoch": 0.8635014836795252, "grad_norm": 9.459715843200684, "learning_rate": 1.5e-05, "loss": 0.8453, "step": 582 }, { "epoch": 0.8679525222551929, "grad_norm": 11.252153396606445, "learning_rate": 1.5e-05, "loss": 0.8494, "step": 585 }, { "epoch": 0.8724035608308606, "grad_norm": 13.70508098602295, "learning_rate": 1.5e-05, "loss": 0.8768, "step": 588 }, { "epoch": 0.8768545994065282, "grad_norm": 5.890571117401123, "learning_rate": 1.5e-05, "loss": 0.855, "step": 591 }, { "epoch": 0.8813056379821959, "grad_norm": 9.50145149230957, "learning_rate": 1.5e-05, "loss": 0.8869, "step": 594 }, { "epoch": 0.8857566765578635, "grad_norm": 14.335087776184082, "learning_rate": 1.5e-05, "loss": 0.8234, "step": 597 }, { "epoch": 0.8902077151335311, "grad_norm": 7.306372165679932, "learning_rate": 1.5e-05, "loss": 0.8548, "step": 600 }, { "epoch": 0.8946587537091988, "grad_norm": 8.26121711730957, "learning_rate": 1.5e-05, "loss": 0.8765, "step": 603 }, { "epoch": 0.8991097922848664, "grad_norm": 9.257493019104004, "learning_rate": 1.5e-05, "loss": 0.8978, "step": 606 }, { "epoch": 0.9035608308605341, "grad_norm": 5.7045817375183105, "learning_rate": 1.5e-05, "loss": 0.8315, "step": 609 }, { "epoch": 0.9080118694362018, "grad_norm": 13.060375213623047, "learning_rate": 1.5e-05, "loss": 0.8272, "step": 612 }, { "epoch": 0.9124629080118695, "grad_norm": 7.294022560119629, "learning_rate": 1.5e-05, "loss": 0.8223, "step": 615 }, { "epoch": 0.9169139465875371, "grad_norm": 5.998393535614014, "learning_rate": 1.5e-05, "loss": 0.8029, "step": 618 }, { "epoch": 0.9213649851632048, "grad_norm": 4.604220390319824, "learning_rate": 1.5e-05, "loss": 0.8613, "step": 621 }, { "epoch": 0.9258160237388724, "grad_norm": 10.009174346923828, "learning_rate": 1.5e-05, "loss": 0.856, "step": 624 }, { "epoch": 0.93026706231454, "grad_norm": 6.437814235687256, "learning_rate": 1.5e-05, "loss": 0.9074, "step": 627 }, { "epoch": 0.9347181008902077, "grad_norm": 16.38670539855957, "learning_rate": 1.5e-05, "loss": 0.8116, "step": 630 }, { "epoch": 0.9391691394658753, "grad_norm": 7.195847511291504, "learning_rate": 1.5e-05, "loss": 0.8411, "step": 633 }, { "epoch": 0.9436201780415431, "grad_norm": 4.928689479827881, "learning_rate": 1.5e-05, "loss": 0.8057, "step": 636 }, { "epoch": 0.9480712166172107, "grad_norm": 4.637528896331787, "learning_rate": 1.5e-05, "loss": 0.8651, "step": 639 }, { "epoch": 0.9525222551928784, "grad_norm": 10.825565338134766, "learning_rate": 1.5e-05, "loss": 0.8757, "step": 642 }, { "epoch": 0.956973293768546, "grad_norm": 9.891039848327637, "learning_rate": 1.5e-05, "loss": 0.7774, "step": 645 }, { "epoch": 0.9614243323442137, "grad_norm": 6.30767297744751, "learning_rate": 1.5e-05, "loss": 0.8346, "step": 648 }, { "epoch": 0.9658753709198813, "grad_norm": 9.901351928710938, "learning_rate": 1.5e-05, "loss": 0.8439, "step": 651 }, { "epoch": 0.9703264094955489, "grad_norm": 8.528812408447266, "learning_rate": 1.5e-05, "loss": 0.922, "step": 654 }, { "epoch": 0.9747774480712166, "grad_norm": 8.85007381439209, "learning_rate": 1.5e-05, "loss": 0.8565, "step": 657 }, { "epoch": 0.9792284866468842, "grad_norm": 7.137876510620117, "learning_rate": 1.5e-05, "loss": 0.886, "step": 660 }, { "epoch": 0.983679525222552, "grad_norm": 12.115394592285156, "learning_rate": 1.5e-05, "loss": 0.9001, "step": 663 }, { "epoch": 0.9881305637982196, "grad_norm": 3.7681024074554443, "learning_rate": 1.5e-05, "loss": 0.8983, "step": 666 }, { "epoch": 0.9925816023738873, "grad_norm": 7.687930583953857, "learning_rate": 1.5e-05, "loss": 0.8629, "step": 669 }, { "epoch": 0.9970326409495549, "grad_norm": 5.996459007263184, "learning_rate": 1.5e-05, "loss": 0.8284, "step": 672 }, { "epoch": 1.0, "eval_loss": 2.075103521347046, "eval_runtime": 554.3952, "eval_samples_per_second": 2.781, "eval_steps_per_second": 0.348, "step": 674 }, { "epoch": 1.0014836795252227, "grad_norm": 10.066774368286133, "learning_rate": 1.5e-05, "loss": 0.8555, "step": 675 }, { "epoch": 1.0059347181008902, "grad_norm": 6.481128692626953, "learning_rate": 1.5e-05, "loss": 0.838, "step": 678 }, { "epoch": 1.010385756676558, "grad_norm": 12.874567985534668, "learning_rate": 1.5e-05, "loss": 0.8869, "step": 681 }, { "epoch": 1.0148367952522255, "grad_norm": 5.834901332855225, "learning_rate": 1.5e-05, "loss": 0.8517, "step": 684 }, { "epoch": 1.0192878338278932, "grad_norm": 15.476794242858887, "learning_rate": 1.5e-05, "loss": 0.8777, "step": 687 }, { "epoch": 1.0237388724035608, "grad_norm": 6.580478191375732, "learning_rate": 1.5e-05, "loss": 0.827, "step": 690 }, { "epoch": 1.0281899109792285, "grad_norm": 9.56643009185791, "learning_rate": 1.5e-05, "loss": 0.8886, "step": 693 }, { "epoch": 1.032640949554896, "grad_norm": 4.428914546966553, "learning_rate": 1.5e-05, "loss": 0.9552, "step": 696 }, { "epoch": 1.0370919881305638, "grad_norm": 4.398239612579346, "learning_rate": 1.5e-05, "loss": 0.845, "step": 699 }, { "epoch": 1.0415430267062316, "grad_norm": 5.540760040283203, "learning_rate": 1.5e-05, "loss": 0.8776, "step": 702 }, { "epoch": 1.045994065281899, "grad_norm": 15.209844589233398, "learning_rate": 1.5e-05, "loss": 0.9569, "step": 705 }, { "epoch": 1.0504451038575668, "grad_norm": 11.813831329345703, "learning_rate": 1.5e-05, "loss": 0.8452, "step": 708 }, { "epoch": 1.0548961424332344, "grad_norm": 8.536015510559082, "learning_rate": 1.5e-05, "loss": 0.8486, "step": 711 }, { "epoch": 1.0593471810089021, "grad_norm": 6.545129299163818, "learning_rate": 1.5e-05, "loss": 0.8856, "step": 714 }, { "epoch": 1.0637982195845697, "grad_norm": 8.14754581451416, "learning_rate": 1.5e-05, "loss": 0.8801, "step": 717 }, { "epoch": 1.0682492581602374, "grad_norm": 7.521109580993652, "learning_rate": 1.5e-05, "loss": 0.8708, "step": 720 }, { "epoch": 1.072700296735905, "grad_norm": 9.358808517456055, "learning_rate": 1.5e-05, "loss": 0.8476, "step": 723 }, { "epoch": 1.0771513353115727, "grad_norm": 6.190918922424316, "learning_rate": 1.5e-05, "loss": 0.8605, "step": 726 }, { "epoch": 1.0816023738872405, "grad_norm": 10.316658020019531, "learning_rate": 1.5e-05, "loss": 0.8518, "step": 729 }, { "epoch": 1.086053412462908, "grad_norm": 5.746811389923096, "learning_rate": 1.5e-05, "loss": 0.8707, "step": 732 }, { "epoch": 1.0905044510385757, "grad_norm": 7.9586663246154785, "learning_rate": 1.5e-05, "loss": 0.8193, "step": 735 }, { "epoch": 1.0949554896142433, "grad_norm": 6.76649808883667, "learning_rate": 1.5e-05, "loss": 0.8491, "step": 738 }, { "epoch": 1.099406528189911, "grad_norm": 7.164156436920166, "learning_rate": 1.5e-05, "loss": 0.8137, "step": 741 }, { "epoch": 1.1038575667655786, "grad_norm": 5.188474178314209, "learning_rate": 1.5e-05, "loss": 0.8105, "step": 744 }, { "epoch": 1.1083086053412463, "grad_norm": 11.81541633605957, "learning_rate": 1.5e-05, "loss": 0.8628, "step": 747 }, { "epoch": 1.1127596439169138, "grad_norm": 6.901886940002441, "learning_rate": 1.5e-05, "loss": 0.8279, "step": 750 }, { "epoch": 1.1172106824925816, "grad_norm": 5.522708892822266, "learning_rate": 1.5e-05, "loss": 0.834, "step": 753 }, { "epoch": 1.1216617210682494, "grad_norm": 10.341312408447266, "learning_rate": 1.5e-05, "loss": 0.7452, "step": 756 }, { "epoch": 1.1261127596439169, "grad_norm": 11.882563591003418, "learning_rate": 1.5e-05, "loss": 0.8481, "step": 759 }, { "epoch": 1.1305637982195846, "grad_norm": 4.872053146362305, "learning_rate": 1.5e-05, "loss": 0.8284, "step": 762 }, { "epoch": 1.1350148367952522, "grad_norm": 10.799345016479492, "learning_rate": 1.5e-05, "loss": 0.8375, "step": 765 }, { "epoch": 1.13946587537092, "grad_norm": 5.207538604736328, "learning_rate": 1.5e-05, "loss": 0.8427, "step": 768 }, { "epoch": 1.1439169139465875, "grad_norm": 12.862470626831055, "learning_rate": 1.5e-05, "loss": 0.8248, "step": 771 }, { "epoch": 1.1483679525222552, "grad_norm": 6.997878074645996, "learning_rate": 1.5e-05, "loss": 0.8827, "step": 774 }, { "epoch": 1.1528189910979227, "grad_norm": 5.541961669921875, "learning_rate": 1.5e-05, "loss": 0.8391, "step": 777 }, { "epoch": 1.1572700296735905, "grad_norm": 8.02748966217041, "learning_rate": 1.5e-05, "loss": 0.8518, "step": 780 }, { "epoch": 1.1617210682492582, "grad_norm": 10.839200973510742, "learning_rate": 1.5e-05, "loss": 0.8715, "step": 783 }, { "epoch": 1.1661721068249258, "grad_norm": 6.69924259185791, "learning_rate": 1.5e-05, "loss": 0.8505, "step": 786 }, { "epoch": 1.1706231454005935, "grad_norm": 15.232388496398926, "learning_rate": 1.5e-05, "loss": 0.8483, "step": 789 }, { "epoch": 1.175074183976261, "grad_norm": 7.842281341552734, "learning_rate": 1.5e-05, "loss": 0.8434, "step": 792 }, { "epoch": 1.1795252225519288, "grad_norm": 9.89548110961914, "learning_rate": 1.5e-05, "loss": 0.8434, "step": 795 }, { "epoch": 1.1839762611275964, "grad_norm": 12.349285125732422, "learning_rate": 1.5e-05, "loss": 0.8522, "step": 798 }, { "epoch": 1.188427299703264, "grad_norm": 6.074175834655762, "learning_rate": 1.5e-05, "loss": 0.8639, "step": 801 }, { "epoch": 1.1928783382789319, "grad_norm": 12.941549301147461, "learning_rate": 1.5e-05, "loss": 0.8215, "step": 804 }, { "epoch": 1.1973293768545994, "grad_norm": 6.716182708740234, "learning_rate": 1.5e-05, "loss": 0.8417, "step": 807 }, { "epoch": 1.2017804154302671, "grad_norm": 7.472183704376221, "learning_rate": 1.5e-05, "loss": 0.8061, "step": 810 }, { "epoch": 1.2062314540059347, "grad_norm": 9.26564884185791, "learning_rate": 1.5e-05, "loss": 0.8834, "step": 813 }, { "epoch": 1.2106824925816024, "grad_norm": 12.621788024902344, "learning_rate": 1.5e-05, "loss": 0.8299, "step": 816 }, { "epoch": 1.21513353115727, "grad_norm": 8.867362976074219, "learning_rate": 1.5e-05, "loss": 0.8374, "step": 819 }, { "epoch": 1.2195845697329377, "grad_norm": 12.072689056396484, "learning_rate": 1.5e-05, "loss": 0.8223, "step": 822 }, { "epoch": 1.2240356083086052, "grad_norm": 10.037847518920898, "learning_rate": 1.5e-05, "loss": 0.7896, "step": 825 }, { "epoch": 1.228486646884273, "grad_norm": 7.16823148727417, "learning_rate": 1.5e-05, "loss": 0.7912, "step": 828 }, { "epoch": 1.2329376854599405, "grad_norm": 13.862353324890137, "learning_rate": 1.5e-05, "loss": 0.8545, "step": 831 }, { "epoch": 1.2373887240356083, "grad_norm": 6.668301582336426, "learning_rate": 1.5e-05, "loss": 0.7566, "step": 834 }, { "epoch": 1.241839762611276, "grad_norm": 10.224084854125977, "learning_rate": 1.5e-05, "loss": 0.8056, "step": 837 }, { "epoch": 1.2462908011869436, "grad_norm": 6.452188014984131, "learning_rate": 1.5e-05, "loss": 0.82, "step": 840 }, { "epoch": 1.2507418397626113, "grad_norm": 7.246963024139404, "learning_rate": 1.5e-05, "loss": 0.8351, "step": 843 }, { "epoch": 1.2551928783382789, "grad_norm": 6.421880722045898, "learning_rate": 1.5e-05, "loss": 0.8563, "step": 846 }, { "epoch": 1.2596439169139466, "grad_norm": 6.952515602111816, "learning_rate": 1.5e-05, "loss": 0.9159, "step": 849 }, { "epoch": 1.2640949554896141, "grad_norm": 4.98225212097168, "learning_rate": 1.5e-05, "loss": 0.8609, "step": 852 }, { "epoch": 1.268545994065282, "grad_norm": 7.60207462310791, "learning_rate": 1.5e-05, "loss": 0.8765, "step": 855 }, { "epoch": 1.2729970326409497, "grad_norm": 7.847710609436035, "learning_rate": 1.5e-05, "loss": 0.8208, "step": 858 }, { "epoch": 1.2774480712166172, "grad_norm": 8.309576988220215, "learning_rate": 1.5e-05, "loss": 0.8579, "step": 861 }, { "epoch": 1.281899109792285, "grad_norm": 8.306654930114746, "learning_rate": 1.5e-05, "loss": 0.8137, "step": 864 }, { "epoch": 1.2863501483679525, "grad_norm": 7.458807945251465, "learning_rate": 1.5e-05, "loss": 0.8503, "step": 867 }, { "epoch": 1.2908011869436202, "grad_norm": 5.15773344039917, "learning_rate": 1.5e-05, "loss": 0.8074, "step": 870 }, { "epoch": 1.2952522255192878, "grad_norm": 5.930022716522217, "learning_rate": 1.5e-05, "loss": 0.7877, "step": 873 }, { "epoch": 1.2997032640949555, "grad_norm": 10.39821720123291, "learning_rate": 1.5e-05, "loss": 0.7825, "step": 876 }, { "epoch": 1.3041543026706233, "grad_norm": 10.691167831420898, "learning_rate": 1.5e-05, "loss": 0.84, "step": 879 }, { "epoch": 1.3086053412462908, "grad_norm": 11.49881362915039, "learning_rate": 1.5e-05, "loss": 0.8858, "step": 882 }, { "epoch": 1.3130563798219583, "grad_norm": 8.16782283782959, "learning_rate": 1.5e-05, "loss": 0.8237, "step": 885 }, { "epoch": 1.317507418397626, "grad_norm": 5.213159561157227, "learning_rate": 1.5e-05, "loss": 0.776, "step": 888 }, { "epoch": 1.3219584569732938, "grad_norm": 7.029541969299316, "learning_rate": 1.5e-05, "loss": 0.8448, "step": 891 }, { "epoch": 1.3264094955489614, "grad_norm": 5.071165084838867, "learning_rate": 1.5e-05, "loss": 0.915, "step": 894 }, { "epoch": 1.3308605341246291, "grad_norm": 8.019569396972656, "learning_rate": 1.5e-05, "loss": 0.873, "step": 897 }, { "epoch": 1.3353115727002967, "grad_norm": 8.721610069274902, "learning_rate": 1.5e-05, "loss": 0.8049, "step": 900 }, { "epoch": 1.3397626112759644, "grad_norm": 15.425809860229492, "learning_rate": 1.5e-05, "loss": 0.8703, "step": 903 }, { "epoch": 1.344213649851632, "grad_norm": 10.104765892028809, "learning_rate": 1.5e-05, "loss": 0.8404, "step": 906 }, { "epoch": 1.3486646884272997, "grad_norm": 6.787662506103516, "learning_rate": 1.5e-05, "loss": 0.8585, "step": 909 }, { "epoch": 1.3531157270029674, "grad_norm": 10.807848930358887, "learning_rate": 1.5e-05, "loss": 0.8074, "step": 912 }, { "epoch": 1.357566765578635, "grad_norm": 4.6103129386901855, "learning_rate": 1.5e-05, "loss": 0.8287, "step": 915 }, { "epoch": 1.3620178041543027, "grad_norm": 7.826140880584717, "learning_rate": 1.5e-05, "loss": 0.8319, "step": 918 }, { "epoch": 1.3664688427299703, "grad_norm": 4.535531044006348, "learning_rate": 1.5e-05, "loss": 0.8546, "step": 921 }, { "epoch": 1.370919881305638, "grad_norm": 6.764124870300293, "learning_rate": 1.5e-05, "loss": 0.8228, "step": 924 }, { "epoch": 1.3753709198813056, "grad_norm": 9.835914611816406, "learning_rate": 1.5e-05, "loss": 0.8356, "step": 927 }, { "epoch": 1.3798219584569733, "grad_norm": 10.747434616088867, "learning_rate": 1.5e-05, "loss": 0.8648, "step": 930 }, { "epoch": 1.384272997032641, "grad_norm": 7.541149139404297, "learning_rate": 1.5e-05, "loss": 0.8486, "step": 933 }, { "epoch": 1.3887240356083086, "grad_norm": 6.978203296661377, "learning_rate": 1.5e-05, "loss": 0.8054, "step": 936 }, { "epoch": 1.3931750741839761, "grad_norm": 15.082099914550781, "learning_rate": 1.5e-05, "loss": 0.8219, "step": 939 }, { "epoch": 1.3976261127596439, "grad_norm": 8.855502128601074, "learning_rate": 1.5e-05, "loss": 0.8023, "step": 942 }, { "epoch": 1.4020771513353116, "grad_norm": 4.794929027557373, "learning_rate": 1.5e-05, "loss": 0.8232, "step": 945 }, { "epoch": 1.4065281899109792, "grad_norm": 5.567296028137207, "learning_rate": 1.5e-05, "loss": 0.7989, "step": 948 }, { "epoch": 1.410979228486647, "grad_norm": 9.492593765258789, "learning_rate": 1.5e-05, "loss": 0.8452, "step": 951 }, { "epoch": 1.4154302670623147, "grad_norm": 7.953827857971191, "learning_rate": 1.5e-05, "loss": 0.8377, "step": 954 }, { "epoch": 1.4198813056379822, "grad_norm": 9.085283279418945, "learning_rate": 1.5e-05, "loss": 0.8983, "step": 957 }, { "epoch": 1.4243323442136497, "grad_norm": 8.406304359436035, "learning_rate": 1.5e-05, "loss": 0.8422, "step": 960 }, { "epoch": 1.4287833827893175, "grad_norm": 5.686973571777344, "learning_rate": 1.5e-05, "loss": 0.8542, "step": 963 }, { "epoch": 1.4332344213649852, "grad_norm": 9.381924629211426, "learning_rate": 1.5e-05, "loss": 0.8904, "step": 966 }, { "epoch": 1.4376854599406528, "grad_norm": 4.451043128967285, "learning_rate": 1.5e-05, "loss": 0.8172, "step": 969 }, { "epoch": 1.4421364985163205, "grad_norm": 7.336870193481445, "learning_rate": 1.5e-05, "loss": 0.8578, "step": 972 }, { "epoch": 1.446587537091988, "grad_norm": 8.10446548461914, "learning_rate": 1.5e-05, "loss": 0.8826, "step": 975 }, { "epoch": 1.4510385756676558, "grad_norm": 8.376605033874512, "learning_rate": 1.5e-05, "loss": 0.8302, "step": 978 }, { "epoch": 1.4554896142433233, "grad_norm": 11.178180694580078, "learning_rate": 1.5e-05, "loss": 0.8797, "step": 981 }, { "epoch": 1.459940652818991, "grad_norm": 10.056670188903809, "learning_rate": 1.5e-05, "loss": 0.8314, "step": 984 }, { "epoch": 1.4643916913946589, "grad_norm": 5.353207588195801, "learning_rate": 1.5e-05, "loss": 0.823, "step": 987 }, { "epoch": 1.4688427299703264, "grad_norm": 10.648890495300293, "learning_rate": 1.5e-05, "loss": 0.8789, "step": 990 }, { "epoch": 1.4732937685459941, "grad_norm": 5.265453338623047, "learning_rate": 1.5e-05, "loss": 0.8392, "step": 993 }, { "epoch": 1.4777448071216617, "grad_norm": 4.404312610626221, "learning_rate": 1.5e-05, "loss": 0.8659, "step": 996 }, { "epoch": 1.4821958456973294, "grad_norm": 7.063133716583252, "learning_rate": 1.5e-05, "loss": 0.8903, "step": 999 }, { "epoch": 1.486646884272997, "grad_norm": 12.400032043457031, "learning_rate": 1.5e-05, "loss": 0.8696, "step": 1002 }, { "epoch": 1.4910979228486647, "grad_norm": 8.297316551208496, "learning_rate": 1.5e-05, "loss": 0.9001, "step": 1005 }, { "epoch": 1.4955489614243325, "grad_norm": 11.91292667388916, "learning_rate": 1.5e-05, "loss": 0.8941, "step": 1008 }, { "epoch": 1.5, "grad_norm": 6.494741916656494, "learning_rate": 1.5e-05, "loss": 0.8534, "step": 1011 }, { "epoch": 1.5044510385756675, "grad_norm": 11.065376281738281, "learning_rate": 1.5e-05, "loss": 0.8624, "step": 1014 }, { "epoch": 1.5089020771513353, "grad_norm": 7.178919315338135, "learning_rate": 1.5e-05, "loss": 0.8742, "step": 1017 }, { "epoch": 1.513353115727003, "grad_norm": 5.641129493713379, "learning_rate": 1.5e-05, "loss": 0.8493, "step": 1020 }, { "epoch": 1.5178041543026706, "grad_norm": 14.277860641479492, "learning_rate": 1.5e-05, "loss": 0.8686, "step": 1023 }, { "epoch": 1.5222551928783383, "grad_norm": 9.708137512207031, "learning_rate": 1.5e-05, "loss": 0.83, "step": 1026 }, { "epoch": 1.526706231454006, "grad_norm": 7.91434383392334, "learning_rate": 1.5e-05, "loss": 0.841, "step": 1029 }, { "epoch": 1.5311572700296736, "grad_norm": 14.816337585449219, "learning_rate": 1.5e-05, "loss": 0.9307, "step": 1032 }, { "epoch": 1.5356083086053411, "grad_norm": 12.463879585266113, "learning_rate": 1.5e-05, "loss": 0.8844, "step": 1035 }, { "epoch": 1.540059347181009, "grad_norm": 7.6568217277526855, "learning_rate": 1.5e-05, "loss": 0.9198, "step": 1038 }, { "epoch": 1.5445103857566767, "grad_norm": 11.649917602539062, "learning_rate": 1.5e-05, "loss": 0.9174, "step": 1041 }, { "epoch": 1.5489614243323442, "grad_norm": 9.973616600036621, "learning_rate": 1.5e-05, "loss": 0.9474, "step": 1044 }, { "epoch": 1.5534124629080117, "grad_norm": 11.569575309753418, "learning_rate": 1.5e-05, "loss": 0.9166, "step": 1047 }, { "epoch": 1.5578635014836797, "grad_norm": 11.283512115478516, "learning_rate": 1.5e-05, "loss": 0.8188, "step": 1050 }, { "epoch": 1.5623145400593472, "grad_norm": 6.829236030578613, "learning_rate": 1.5e-05, "loss": 0.8318, "step": 1053 }, { "epoch": 1.5667655786350148, "grad_norm": 10.211257934570312, "learning_rate": 1.5e-05, "loss": 0.8139, "step": 1056 }, { "epoch": 1.5712166172106825, "grad_norm": 6.259841442108154, "learning_rate": 1.5e-05, "loss": 0.8444, "step": 1059 }, { "epoch": 1.5756676557863503, "grad_norm": 14.19024658203125, "learning_rate": 1.5e-05, "loss": 0.8641, "step": 1062 }, { "epoch": 1.5801186943620178, "grad_norm": 6.3594231605529785, "learning_rate": 1.5e-05, "loss": 0.9041, "step": 1065 }, { "epoch": 1.5845697329376853, "grad_norm": 9.81156063079834, "learning_rate": 1.5e-05, "loss": 0.8703, "step": 1068 }, { "epoch": 1.589020771513353, "grad_norm": 6.122777938842773, "learning_rate": 1.5e-05, "loss": 0.8309, "step": 1071 }, { "epoch": 1.5934718100890208, "grad_norm": 11.714445114135742, "learning_rate": 1.5e-05, "loss": 0.8151, "step": 1074 }, { "epoch": 1.5979228486646884, "grad_norm": 12.073863983154297, "learning_rate": 1.5e-05, "loss": 0.9148, "step": 1077 }, { "epoch": 1.6023738872403561, "grad_norm": 8.177748680114746, "learning_rate": 1.5e-05, "loss": 0.865, "step": 1080 }, { "epoch": 1.6068249258160239, "grad_norm": 13.913122177124023, "learning_rate": 1.5e-05, "loss": 0.8409, "step": 1083 }, { "epoch": 1.6112759643916914, "grad_norm": 8.375801086425781, "learning_rate": 1.5e-05, "loss": 0.7768, "step": 1086 }, { "epoch": 1.615727002967359, "grad_norm": 6.173603057861328, "learning_rate": 1.5e-05, "loss": 0.8343, "step": 1089 }, { "epoch": 1.6201780415430267, "grad_norm": 10.390620231628418, "learning_rate": 1.5e-05, "loss": 0.8338, "step": 1092 }, { "epoch": 1.6246290801186944, "grad_norm": 8.413612365722656, "learning_rate": 1.5e-05, "loss": 0.8517, "step": 1095 }, { "epoch": 1.629080118694362, "grad_norm": 9.790428161621094, "learning_rate": 1.5e-05, "loss": 0.8443, "step": 1098 }, { "epoch": 1.6335311572700295, "grad_norm": 13.228864669799805, "learning_rate": 1.5e-05, "loss": 0.8481, "step": 1101 }, { "epoch": 1.6379821958456975, "grad_norm": 11.918046951293945, "learning_rate": 1.5e-05, "loss": 0.8901, "step": 1104 }, { "epoch": 1.642433234421365, "grad_norm": 6.354975700378418, "learning_rate": 1.5e-05, "loss": 0.8817, "step": 1107 }, { "epoch": 1.6468842729970326, "grad_norm": 10.373885154724121, "learning_rate": 1.5e-05, "loss": 0.8443, "step": 1110 }, { "epoch": 1.6513353115727003, "grad_norm": 7.181490421295166, "learning_rate": 1.5e-05, "loss": 0.857, "step": 1113 }, { "epoch": 1.655786350148368, "grad_norm": 8.490324020385742, "learning_rate": 1.5e-05, "loss": 0.8714, "step": 1116 }, { "epoch": 1.6602373887240356, "grad_norm": 5.962569236755371, "learning_rate": 1.5e-05, "loss": 0.7782, "step": 1119 }, { "epoch": 1.6646884272997031, "grad_norm": 7.268184185028076, "learning_rate": 1.5e-05, "loss": 0.8829, "step": 1122 }, { "epoch": 1.6691394658753709, "grad_norm": 9.73929500579834, "learning_rate": 1.5e-05, "loss": 0.8069, "step": 1125 }, { "epoch": 1.6735905044510386, "grad_norm": 8.92696762084961, "learning_rate": 1.5e-05, "loss": 0.8727, "step": 1128 }, { "epoch": 1.6780415430267062, "grad_norm": 7.317033767700195, "learning_rate": 1.5e-05, "loss": 0.8104, "step": 1131 }, { "epoch": 1.682492581602374, "grad_norm": 6.796001434326172, "learning_rate": 1.5e-05, "loss": 0.9314, "step": 1134 }, { "epoch": 1.6869436201780417, "grad_norm": 8.300507545471191, "learning_rate": 1.5e-05, "loss": 0.8108, "step": 1137 }, { "epoch": 1.6913946587537092, "grad_norm": 14.353339195251465, "learning_rate": 1.5e-05, "loss": 0.8735, "step": 1140 }, { "epoch": 1.6958456973293767, "grad_norm": 8.713440895080566, "learning_rate": 1.5e-05, "loss": 0.8634, "step": 1143 }, { "epoch": 1.7002967359050445, "grad_norm": 12.015419960021973, "learning_rate": 1.5e-05, "loss": 0.8491, "step": 1146 }, { "epoch": 1.7047477744807122, "grad_norm": 5.322451114654541, "learning_rate": 1.5e-05, "loss": 0.8491, "step": 1149 }, { "epoch": 1.7091988130563798, "grad_norm": 7.663971900939941, "learning_rate": 1.5e-05, "loss": 0.841, "step": 1152 }, { "epoch": 1.7136498516320475, "grad_norm": 9.272565841674805, "learning_rate": 1.5e-05, "loss": 0.9399, "step": 1155 }, { "epoch": 1.7181008902077153, "grad_norm": 6.013884544372559, "learning_rate": 1.5e-05, "loss": 0.8239, "step": 1158 }, { "epoch": 1.7225519287833828, "grad_norm": 9.190864562988281, "learning_rate": 1.5e-05, "loss": 0.8211, "step": 1161 }, { "epoch": 1.7270029673590503, "grad_norm": 9.801536560058594, "learning_rate": 1.5e-05, "loss": 0.8571, "step": 1164 }, { "epoch": 1.731454005934718, "grad_norm": 13.254154205322266, "learning_rate": 1.5e-05, "loss": 0.8762, "step": 1167 }, { "epoch": 1.7359050445103859, "grad_norm": 10.48544979095459, "learning_rate": 1.5e-05, "loss": 0.7882, "step": 1170 }, { "epoch": 1.7403560830860534, "grad_norm": 9.48491382598877, "learning_rate": 1.5e-05, "loss": 0.8939, "step": 1173 }, { "epoch": 1.744807121661721, "grad_norm": 8.662673950195312, "learning_rate": 1.5e-05, "loss": 0.8562, "step": 1176 }, { "epoch": 1.7492581602373887, "grad_norm": 11.683974266052246, "learning_rate": 1.5e-05, "loss": 0.8178, "step": 1179 }, { "epoch": 1.7537091988130564, "grad_norm": 17.12523078918457, "learning_rate": 1.5e-05, "loss": 0.8192, "step": 1182 }, { "epoch": 1.758160237388724, "grad_norm": 4.6900835037231445, "learning_rate": 1.5e-05, "loss": 0.8683, "step": 1185 }, { "epoch": 1.7626112759643917, "grad_norm": 7.892794132232666, "learning_rate": 1.5e-05, "loss": 0.8594, "step": 1188 }, { "epoch": 1.7670623145400595, "grad_norm": 9.247455596923828, "learning_rate": 1.5e-05, "loss": 0.858, "step": 1191 }, { "epoch": 1.771513353115727, "grad_norm": 7.50583028793335, "learning_rate": 1.5e-05, "loss": 0.7865, "step": 1194 }, { "epoch": 1.7759643916913945, "grad_norm": 4.668313503265381, "learning_rate": 1.5e-05, "loss": 0.8215, "step": 1197 }, { "epoch": 1.7804154302670623, "grad_norm": 10.414295196533203, "learning_rate": 1.5e-05, "loss": 0.8199, "step": 1200 }, { "epoch": 1.78486646884273, "grad_norm": 4.297197341918945, "learning_rate": 1.5e-05, "loss": 0.8129, "step": 1203 }, { "epoch": 1.7893175074183976, "grad_norm": 9.394143104553223, "learning_rate": 1.5e-05, "loss": 0.8624, "step": 1206 }, { "epoch": 1.7937685459940653, "grad_norm": 8.61468505859375, "learning_rate": 1.5e-05, "loss": 0.797, "step": 1209 }, { "epoch": 1.798219584569733, "grad_norm": 8.216081619262695, "learning_rate": 1.5e-05, "loss": 0.8217, "step": 1212 }, { "epoch": 1.8026706231454006, "grad_norm": 7.414550304412842, "learning_rate": 1.5e-05, "loss": 0.8259, "step": 1215 }, { "epoch": 1.8071216617210681, "grad_norm": 7.1664042472839355, "learning_rate": 1.5e-05, "loss": 0.8568, "step": 1218 }, { "epoch": 1.811572700296736, "grad_norm": 6.590891361236572, "learning_rate": 1.5e-05, "loss": 0.8269, "step": 1221 }, { "epoch": 1.8160237388724036, "grad_norm": 8.408268928527832, "learning_rate": 1.5e-05, "loss": 0.9052, "step": 1224 }, { "epoch": 1.8204747774480712, "grad_norm": 19.62491226196289, "learning_rate": 1.5e-05, "loss": 0.8959, "step": 1227 }, { "epoch": 1.8249258160237387, "grad_norm": 11.636604309082031, "learning_rate": 1.5e-05, "loss": 0.8714, "step": 1230 }, { "epoch": 1.8293768545994067, "grad_norm": 8.018316268920898, "learning_rate": 1.5e-05, "loss": 0.8673, "step": 1233 }, { "epoch": 1.8338278931750742, "grad_norm": 4.958278179168701, "learning_rate": 1.5e-05, "loss": 0.8551, "step": 1236 }, { "epoch": 1.8382789317507418, "grad_norm": 13.244430541992188, "learning_rate": 1.5e-05, "loss": 0.8011, "step": 1239 }, { "epoch": 1.8427299703264095, "grad_norm": 7.185425281524658, "learning_rate": 1.5e-05, "loss": 0.7873, "step": 1242 }, { "epoch": 1.8471810089020773, "grad_norm": 5.537222862243652, "learning_rate": 1.5e-05, "loss": 0.8237, "step": 1245 }, { "epoch": 1.8516320474777448, "grad_norm": 5.888150215148926, "learning_rate": 1.5e-05, "loss": 0.8278, "step": 1248 }, { "epoch": 1.8560830860534123, "grad_norm": 7.887198448181152, "learning_rate": 1.5e-05, "loss": 0.8399, "step": 1251 }, { "epoch": 1.86053412462908, "grad_norm": 8.108527183532715, "learning_rate": 1.5e-05, "loss": 0.8711, "step": 1254 }, { "epoch": 1.8649851632047478, "grad_norm": 4.459034442901611, "learning_rate": 1.5e-05, "loss": 0.9171, "step": 1257 }, { "epoch": 1.8694362017804154, "grad_norm": 4.293658256530762, "learning_rate": 1.5e-05, "loss": 0.882, "step": 1260 }, { "epoch": 1.8738872403560831, "grad_norm": 6.042054176330566, "learning_rate": 1.5e-05, "loss": 0.8623, "step": 1263 }, { "epoch": 1.8783382789317509, "grad_norm": 11.530425071716309, "learning_rate": 1.5e-05, "loss": 0.9091, "step": 1266 }, { "epoch": 1.8827893175074184, "grad_norm": 7.389677047729492, "learning_rate": 1.5e-05, "loss": 0.8579, "step": 1269 }, { "epoch": 1.887240356083086, "grad_norm": 10.24569034576416, "learning_rate": 1.5e-05, "loss": 0.8281, "step": 1272 }, { "epoch": 1.8916913946587537, "grad_norm": 9.817954063415527, "learning_rate": 1.5e-05, "loss": 0.8575, "step": 1275 }, { "epoch": 1.8961424332344214, "grad_norm": 11.875582695007324, "learning_rate": 1.5e-05, "loss": 0.8375, "step": 1278 }, { "epoch": 1.900593471810089, "grad_norm": 7.8601837158203125, "learning_rate": 1.5e-05, "loss": 0.875, "step": 1281 }, { "epoch": 1.9050445103857567, "grad_norm": 6.628482341766357, "learning_rate": 1.5e-05, "loss": 0.8568, "step": 1284 }, { "epoch": 1.9094955489614245, "grad_norm": 14.135390281677246, "learning_rate": 1.5e-05, "loss": 0.7945, "step": 1287 }, { "epoch": 1.913946587537092, "grad_norm": 7.571518421173096, "learning_rate": 1.5e-05, "loss": 0.8387, "step": 1290 }, { "epoch": 1.9183976261127595, "grad_norm": 4.84207010269165, "learning_rate": 1.5e-05, "loss": 0.8573, "step": 1293 }, { "epoch": 1.9228486646884273, "grad_norm": 7.50210428237915, "learning_rate": 1.5e-05, "loss": 0.8396, "step": 1296 }, { "epoch": 1.927299703264095, "grad_norm": 10.158517837524414, "learning_rate": 1.5e-05, "loss": 0.8407, "step": 1299 }, { "epoch": 1.9317507418397626, "grad_norm": 4.945800304412842, "learning_rate": 1.5e-05, "loss": 0.849, "step": 1302 }, { "epoch": 1.9362017804154301, "grad_norm": 5.40016508102417, "learning_rate": 1.5e-05, "loss": 0.7703, "step": 1305 }, { "epoch": 1.9406528189910979, "grad_norm": 7.8204665184021, "learning_rate": 1.5e-05, "loss": 0.8625, "step": 1308 }, { "epoch": 1.9451038575667656, "grad_norm": 6.786766052246094, "learning_rate": 1.5e-05, "loss": 0.8816, "step": 1311 }, { "epoch": 1.9495548961424332, "grad_norm": 6.751473903656006, "learning_rate": 1.5e-05, "loss": 0.8586, "step": 1314 }, { "epoch": 1.954005934718101, "grad_norm": 9.781673431396484, "learning_rate": 1.5e-05, "loss": 0.8348, "step": 1317 }, { "epoch": 1.9584569732937687, "grad_norm": 14.07801628112793, "learning_rate": 1.5e-05, "loss": 0.9011, "step": 1320 }, { "epoch": 1.9629080118694362, "grad_norm": 10.769022941589355, "learning_rate": 1.5e-05, "loss": 0.8547, "step": 1323 }, { "epoch": 1.9673590504451037, "grad_norm": 5.165210723876953, "learning_rate": 1.5e-05, "loss": 0.8819, "step": 1326 }, { "epoch": 1.9718100890207715, "grad_norm": 6.151379108428955, "learning_rate": 1.5e-05, "loss": 0.8767, "step": 1329 }, { "epoch": 1.9762611275964392, "grad_norm": 8.154912948608398, "learning_rate": 1.5e-05, "loss": 0.8856, "step": 1332 }, { "epoch": 1.9807121661721068, "grad_norm": 7.511419773101807, "learning_rate": 1.5e-05, "loss": 0.835, "step": 1335 }, { "epoch": 1.9851632047477745, "grad_norm": 8.648750305175781, "learning_rate": 1.5e-05, "loss": 0.8391, "step": 1338 }, { "epoch": 1.9896142433234423, "grad_norm": 5.8288984298706055, "learning_rate": 1.5e-05, "loss": 0.8746, "step": 1341 }, { "epoch": 1.9940652818991098, "grad_norm": 7.342560768127441, "learning_rate": 1.5e-05, "loss": 0.8591, "step": 1344 }, { "epoch": 1.9985163204747773, "grad_norm": 9.906895637512207, "learning_rate": 1.5e-05, "loss": 0.8925, "step": 1347 }, { "epoch": 2.0, "eval_loss": 2.071904420852661, "eval_runtime": 553.6606, "eval_samples_per_second": 2.785, "eval_steps_per_second": 0.349, "step": 1348 }, { "epoch": 2.0029673590504453, "grad_norm": 11.174464225769043, "learning_rate": 1.5e-05, "loss": 0.8723, "step": 1350 }, { "epoch": 2.007418397626113, "grad_norm": 6.163029193878174, "learning_rate": 1.5e-05, "loss": 0.8487, "step": 1353 }, { "epoch": 2.0118694362017804, "grad_norm": 11.33940601348877, "learning_rate": 1.5e-05, "loss": 0.8795, "step": 1356 }, { "epoch": 2.016320474777448, "grad_norm": 15.676403999328613, "learning_rate": 1.5e-05, "loss": 0.91, "step": 1359 }, { "epoch": 2.020771513353116, "grad_norm": 13.067048072814941, "learning_rate": 1.5e-05, "loss": 0.8249, "step": 1362 }, { "epoch": 2.0252225519287834, "grad_norm": 9.354158401489258, "learning_rate": 1.5e-05, "loss": 0.8865, "step": 1365 }, { "epoch": 2.029673590504451, "grad_norm": 5.574648380279541, "learning_rate": 1.5e-05, "loss": 0.8637, "step": 1368 }, { "epoch": 2.0341246290801185, "grad_norm": 15.917570114135742, "learning_rate": 1.5e-05, "loss": 0.8854, "step": 1371 }, { "epoch": 2.0385756676557865, "grad_norm": 5.499011516571045, "learning_rate": 1.5e-05, "loss": 0.8468, "step": 1374 }, { "epoch": 2.043026706231454, "grad_norm": 15.698616027832031, "learning_rate": 1.5e-05, "loss": 0.821, "step": 1377 }, { "epoch": 2.0474777448071215, "grad_norm": 6.169116497039795, "learning_rate": 1.5e-05, "loss": 0.8785, "step": 1380 }, { "epoch": 2.0519287833827895, "grad_norm": 8.438339233398438, "learning_rate": 1.5e-05, "loss": 0.8298, "step": 1383 }, { "epoch": 2.056379821958457, "grad_norm": 13.13904857635498, "learning_rate": 1.5e-05, "loss": 0.8698, "step": 1386 }, { "epoch": 2.0608308605341246, "grad_norm": 5.194973468780518, "learning_rate": 1.5e-05, "loss": 0.8581, "step": 1389 }, { "epoch": 2.065281899109792, "grad_norm": 6.09019660949707, "learning_rate": 1.5e-05, "loss": 0.8442, "step": 1392 }, { "epoch": 2.06973293768546, "grad_norm": 12.113000869750977, "learning_rate": 1.5e-05, "loss": 0.8349, "step": 1395 }, { "epoch": 2.0741839762611276, "grad_norm": 8.027348518371582, "learning_rate": 1.5e-05, "loss": 0.8507, "step": 1398 }, { "epoch": 2.078635014836795, "grad_norm": 11.222186088562012, "learning_rate": 1.5e-05, "loss": 0.8345, "step": 1401 }, { "epoch": 2.083086053412463, "grad_norm": 7.976278781890869, "learning_rate": 1.5e-05, "loss": 0.8036, "step": 1404 }, { "epoch": 2.0875370919881306, "grad_norm": 9.854942321777344, "learning_rate": 1.5e-05, "loss": 0.8363, "step": 1407 }, { "epoch": 2.091988130563798, "grad_norm": 11.801050186157227, "learning_rate": 1.5e-05, "loss": 0.7906, "step": 1410 }, { "epoch": 2.0964391691394657, "grad_norm": 9.733396530151367, "learning_rate": 1.5e-05, "loss": 0.8425, "step": 1413 }, { "epoch": 2.1008902077151337, "grad_norm": 11.16501235961914, "learning_rate": 1.5e-05, "loss": 0.7729, "step": 1416 }, { "epoch": 2.105341246290801, "grad_norm": 10.145631790161133, "learning_rate": 1.5e-05, "loss": 0.8017, "step": 1419 }, { "epoch": 2.1097922848664687, "grad_norm": 4.5289764404296875, "learning_rate": 1.5e-05, "loss": 0.9057, "step": 1422 }, { "epoch": 2.1142433234421363, "grad_norm": 6.727800369262695, "learning_rate": 1.5e-05, "loss": 0.8651, "step": 1425 }, { "epoch": 2.1186943620178043, "grad_norm": 11.357308387756348, "learning_rate": 1.5e-05, "loss": 0.8358, "step": 1428 }, { "epoch": 2.123145400593472, "grad_norm": 6.047675609588623, "learning_rate": 1.5e-05, "loss": 0.8265, "step": 1431 }, { "epoch": 2.1275964391691393, "grad_norm": 8.08861255645752, "learning_rate": 1.5e-05, "loss": 0.8374, "step": 1434 }, { "epoch": 2.1320474777448073, "grad_norm": 7.7563958168029785, "learning_rate": 1.5e-05, "loss": 0.8818, "step": 1437 }, { "epoch": 2.136498516320475, "grad_norm": 7.988875865936279, "learning_rate": 1.5e-05, "loss": 0.8241, "step": 1440 }, { "epoch": 2.1409495548961424, "grad_norm": 12.524341583251953, "learning_rate": 1.5e-05, "loss": 0.7918, "step": 1443 }, { "epoch": 2.14540059347181, "grad_norm": 6.229768753051758, "learning_rate": 1.5e-05, "loss": 0.862, "step": 1446 }, { "epoch": 2.149851632047478, "grad_norm": 8.271695137023926, "learning_rate": 1.5e-05, "loss": 0.8522, "step": 1449 }, { "epoch": 2.1543026706231454, "grad_norm": 5.045875072479248, "learning_rate": 1.5e-05, "loss": 0.8574, "step": 1452 }, { "epoch": 2.158753709198813, "grad_norm": 11.379587173461914, "learning_rate": 1.5e-05, "loss": 0.8524, "step": 1455 }, { "epoch": 2.163204747774481, "grad_norm": 8.184687614440918, "learning_rate": 1.5e-05, "loss": 0.84, "step": 1458 }, { "epoch": 2.1676557863501484, "grad_norm": 9.615589141845703, "learning_rate": 1.5e-05, "loss": 0.8334, "step": 1461 }, { "epoch": 2.172106824925816, "grad_norm": 18.80459213256836, "learning_rate": 1.5e-05, "loss": 0.805, "step": 1464 }, { "epoch": 2.1765578635014835, "grad_norm": 14.540130615234375, "learning_rate": 1.5e-05, "loss": 0.8749, "step": 1467 }, { "epoch": 2.1810089020771515, "grad_norm": 6.465779781341553, "learning_rate": 1.5e-05, "loss": 0.8481, "step": 1470 }, { "epoch": 2.185459940652819, "grad_norm": 9.467011451721191, "learning_rate": 1.5e-05, "loss": 0.8818, "step": 1473 }, { "epoch": 2.1899109792284865, "grad_norm": 11.624500274658203, "learning_rate": 1.5e-05, "loss": 0.8488, "step": 1476 }, { "epoch": 2.1943620178041545, "grad_norm": 4.053292751312256, "learning_rate": 1.5e-05, "loss": 0.8816, "step": 1479 }, { "epoch": 2.198813056379822, "grad_norm": 11.990628242492676, "learning_rate": 1.5e-05, "loss": 0.8632, "step": 1482 }, { "epoch": 2.2032640949554896, "grad_norm": 5.125602722167969, "learning_rate": 1.5e-05, "loss": 0.8546, "step": 1485 }, { "epoch": 2.207715133531157, "grad_norm": 12.101594924926758, "learning_rate": 1.5e-05, "loss": 0.7745, "step": 1488 }, { "epoch": 2.212166172106825, "grad_norm": 7.778988838195801, "learning_rate": 1.5e-05, "loss": 0.8345, "step": 1491 }, { "epoch": 2.2166172106824926, "grad_norm": 9.549551010131836, "learning_rate": 1.5e-05, "loss": 0.788, "step": 1494 }, { "epoch": 2.22106824925816, "grad_norm": 9.322439193725586, "learning_rate": 1.5e-05, "loss": 0.8578, "step": 1497 }, { "epoch": 2.2255192878338277, "grad_norm": 4.3148298263549805, "learning_rate": 1.5e-05, "loss": 0.8553, "step": 1500 }, { "epoch": 2.2299703264094957, "grad_norm": 8.451520919799805, "learning_rate": 1.5e-05, "loss": 0.8682, "step": 1503 }, { "epoch": 2.234421364985163, "grad_norm": 6.928389072418213, "learning_rate": 1.5e-05, "loss": 0.8307, "step": 1506 }, { "epoch": 2.2388724035608307, "grad_norm": 6.243911266326904, "learning_rate": 1.5e-05, "loss": 0.8033, "step": 1509 }, { "epoch": 2.2433234421364987, "grad_norm": 5.559226036071777, "learning_rate": 1.5e-05, "loss": 0.8383, "step": 1512 }, { "epoch": 2.2477744807121662, "grad_norm": 4.369063854217529, "learning_rate": 1.5e-05, "loss": 0.8009, "step": 1515 }, { "epoch": 2.2522255192878338, "grad_norm": 7.634733200073242, "learning_rate": 1.5e-05, "loss": 0.7984, "step": 1518 }, { "epoch": 2.2566765578635013, "grad_norm": 6.254056453704834, "learning_rate": 1.5e-05, "loss": 0.8802, "step": 1521 }, { "epoch": 2.2611275964391693, "grad_norm": 5.46887731552124, "learning_rate": 1.5e-05, "loss": 0.8178, "step": 1524 }, { "epoch": 2.265578635014837, "grad_norm": 16.369176864624023, "learning_rate": 1.5e-05, "loss": 0.8448, "step": 1527 }, { "epoch": 2.2700296735905043, "grad_norm": 7.460346221923828, "learning_rate": 1.5e-05, "loss": 0.8313, "step": 1530 }, { "epoch": 2.274480712166172, "grad_norm": 11.850396156311035, "learning_rate": 1.5e-05, "loss": 0.8652, "step": 1533 }, { "epoch": 2.27893175074184, "grad_norm": 7.525960445404053, "learning_rate": 1.5e-05, "loss": 0.9073, "step": 1536 }, { "epoch": 2.2833827893175074, "grad_norm": 6.6893391609191895, "learning_rate": 1.5e-05, "loss": 0.8186, "step": 1539 }, { "epoch": 2.287833827893175, "grad_norm": 8.127947807312012, "learning_rate": 1.5e-05, "loss": 0.8062, "step": 1542 }, { "epoch": 2.292284866468843, "grad_norm": 4.763282299041748, "learning_rate": 1.5e-05, "loss": 0.7815, "step": 1545 }, { "epoch": 2.2967359050445104, "grad_norm": 8.980463981628418, "learning_rate": 1.5e-05, "loss": 0.8257, "step": 1548 }, { "epoch": 2.301186943620178, "grad_norm": 5.902709484100342, "learning_rate": 1.5e-05, "loss": 0.8655, "step": 1551 }, { "epoch": 2.3056379821958455, "grad_norm": 9.1312255859375, "learning_rate": 1.5e-05, "loss": 0.8125, "step": 1554 }, { "epoch": 2.3100890207715135, "grad_norm": 11.039669036865234, "learning_rate": 1.5e-05, "loss": 0.8216, "step": 1557 }, { "epoch": 2.314540059347181, "grad_norm": 8.05490779876709, "learning_rate": 1.5e-05, "loss": 0.8396, "step": 1560 }, { "epoch": 2.3189910979228485, "grad_norm": 5.826514720916748, "learning_rate": 1.5e-05, "loss": 0.8757, "step": 1563 }, { "epoch": 2.3234421364985165, "grad_norm": 7.574896812438965, "learning_rate": 1.5e-05, "loss": 0.7736, "step": 1566 }, { "epoch": 2.327893175074184, "grad_norm": 6.01354455947876, "learning_rate": 1.5e-05, "loss": 0.8401, "step": 1569 }, { "epoch": 2.3323442136498516, "grad_norm": 6.542453289031982, "learning_rate": 1.5e-05, "loss": 0.8068, "step": 1572 }, { "epoch": 2.336795252225519, "grad_norm": 9.089799880981445, "learning_rate": 1.5e-05, "loss": 0.8628, "step": 1575 }, { "epoch": 2.341246290801187, "grad_norm": 6.65020227432251, "learning_rate": 1.5e-05, "loss": 0.8298, "step": 1578 }, { "epoch": 2.3456973293768546, "grad_norm": 6.966747760772705, "learning_rate": 1.5e-05, "loss": 0.8445, "step": 1581 }, { "epoch": 2.350148367952522, "grad_norm": 8.938283920288086, "learning_rate": 1.5e-05, "loss": 0.8404, "step": 1584 }, { "epoch": 2.35459940652819, "grad_norm": 7.403584957122803, "learning_rate": 1.5e-05, "loss": 0.8531, "step": 1587 }, { "epoch": 2.3590504451038576, "grad_norm": 5.015456199645996, "learning_rate": 1.5e-05, "loss": 0.8488, "step": 1590 }, { "epoch": 2.363501483679525, "grad_norm": 10.829426765441895, "learning_rate": 1.5e-05, "loss": 0.8292, "step": 1593 }, { "epoch": 2.3679525222551927, "grad_norm": 10.542449951171875, "learning_rate": 1.5e-05, "loss": 0.9097, "step": 1596 }, { "epoch": 2.3724035608308607, "grad_norm": 5.919280529022217, "learning_rate": 1.5e-05, "loss": 0.857, "step": 1599 }, { "epoch": 2.376854599406528, "grad_norm": 12.15097713470459, "learning_rate": 1.5e-05, "loss": 0.8439, "step": 1602 }, { "epoch": 2.3813056379821957, "grad_norm": 12.634583473205566, "learning_rate": 1.5e-05, "loss": 0.8838, "step": 1605 }, { "epoch": 2.3857566765578637, "grad_norm": 9.54806900024414, "learning_rate": 1.5e-05, "loss": 0.9001, "step": 1608 }, { "epoch": 2.3902077151335313, "grad_norm": 5.300346851348877, "learning_rate": 1.5e-05, "loss": 0.84, "step": 1611 }, { "epoch": 2.394658753709199, "grad_norm": 6.94837760925293, "learning_rate": 1.5e-05, "loss": 0.7643, "step": 1614 }, { "epoch": 2.3991097922848663, "grad_norm": 12.666196823120117, "learning_rate": 1.5e-05, "loss": 0.8797, "step": 1617 }, { "epoch": 2.4035608308605343, "grad_norm": 11.400768280029297, "learning_rate": 1.5e-05, "loss": 0.8755, "step": 1620 }, { "epoch": 2.408011869436202, "grad_norm": 9.61484146118164, "learning_rate": 1.5e-05, "loss": 0.8662, "step": 1623 }, { "epoch": 2.4124629080118694, "grad_norm": 7.2894110679626465, "learning_rate": 1.5e-05, "loss": 0.7903, "step": 1626 }, { "epoch": 2.4169139465875373, "grad_norm": 4.545930862426758, "learning_rate": 1.5e-05, "loss": 0.862, "step": 1629 }, { "epoch": 2.421364985163205, "grad_norm": 16.610261917114258, "learning_rate": 1.5e-05, "loss": 0.8769, "step": 1632 }, { "epoch": 2.4258160237388724, "grad_norm": 14.895539283752441, "learning_rate": 1.5e-05, "loss": 0.8884, "step": 1635 }, { "epoch": 2.43026706231454, "grad_norm": 6.956692218780518, "learning_rate": 1.5e-05, "loss": 0.8112, "step": 1638 }, { "epoch": 2.434718100890208, "grad_norm": 8.233116149902344, "learning_rate": 1.5e-05, "loss": 0.8461, "step": 1641 }, { "epoch": 2.4391691394658754, "grad_norm": 9.529879570007324, "learning_rate": 1.5e-05, "loss": 0.8649, "step": 1644 }, { "epoch": 2.443620178041543, "grad_norm": 7.341912269592285, "learning_rate": 1.5e-05, "loss": 0.8393, "step": 1647 }, { "epoch": 2.4480712166172105, "grad_norm": 7.184902667999268, "learning_rate": 1.5e-05, "loss": 0.8309, "step": 1650 }, { "epoch": 2.4525222551928785, "grad_norm": 14.62401008605957, "learning_rate": 1.5e-05, "loss": 0.8549, "step": 1653 }, { "epoch": 2.456973293768546, "grad_norm": 5.0358662605285645, "learning_rate": 1.5e-05, "loss": 0.8726, "step": 1656 }, { "epoch": 2.4614243323442135, "grad_norm": 3.3150253295898438, "learning_rate": 1.5e-05, "loss": 0.8276, "step": 1659 }, { "epoch": 2.465875370919881, "grad_norm": 8.129389762878418, "learning_rate": 1.5e-05, "loss": 0.8692, "step": 1662 }, { "epoch": 2.470326409495549, "grad_norm": 8.148307800292969, "learning_rate": 1.5e-05, "loss": 0.8713, "step": 1665 }, { "epoch": 2.4747774480712166, "grad_norm": 5.398252487182617, "learning_rate": 1.5e-05, "loss": 0.8615, "step": 1668 }, { "epoch": 2.479228486646884, "grad_norm": 4.667980194091797, "learning_rate": 1.5e-05, "loss": 0.8324, "step": 1671 }, { "epoch": 2.483679525222552, "grad_norm": 6.927284240722656, "learning_rate": 1.5e-05, "loss": 0.8589, "step": 1674 }, { "epoch": 2.4881305637982196, "grad_norm": 11.005992889404297, "learning_rate": 1.5e-05, "loss": 0.8564, "step": 1677 }, { "epoch": 2.492581602373887, "grad_norm": 16.280454635620117, "learning_rate": 1.5e-05, "loss": 0.8761, "step": 1680 }, { "epoch": 2.4970326409495547, "grad_norm": 8.563511848449707, "learning_rate": 1.5e-05, "loss": 0.8643, "step": 1683 }, { "epoch": 2.5014836795252227, "grad_norm": 17.003629684448242, "learning_rate": 1.5e-05, "loss": 0.8249, "step": 1686 }, { "epoch": 2.50593471810089, "grad_norm": 6.441048622131348, "learning_rate": 1.5e-05, "loss": 0.9072, "step": 1689 }, { "epoch": 2.5103857566765577, "grad_norm": 6.359565734863281, "learning_rate": 1.5e-05, "loss": 0.8495, "step": 1692 }, { "epoch": 2.5148367952522257, "grad_norm": 9.161234855651855, "learning_rate": 1.5e-05, "loss": 0.8319, "step": 1695 }, { "epoch": 2.5192878338278932, "grad_norm": 10.241405487060547, "learning_rate": 1.5e-05, "loss": 0.7881, "step": 1698 }, { "epoch": 2.5237388724035608, "grad_norm": 9.603667259216309, "learning_rate": 1.5e-05, "loss": 0.8681, "step": 1701 }, { "epoch": 2.5281899109792283, "grad_norm": 8.364523887634277, "learning_rate": 1.5e-05, "loss": 0.7961, "step": 1704 }, { "epoch": 2.5326409495548963, "grad_norm": 8.140654563903809, "learning_rate": 1.5e-05, "loss": 0.8807, "step": 1707 }, { "epoch": 2.537091988130564, "grad_norm": 12.45283031463623, "learning_rate": 1.5e-05, "loss": 0.8206, "step": 1710 }, { "epoch": 2.5415430267062313, "grad_norm": 7.65419864654541, "learning_rate": 1.5e-05, "loss": 0.8989, "step": 1713 }, { "epoch": 2.5459940652818993, "grad_norm": 4.040281295776367, "learning_rate": 1.5e-05, "loss": 0.8814, "step": 1716 }, { "epoch": 2.550445103857567, "grad_norm": 11.04344654083252, "learning_rate": 1.5e-05, "loss": 0.8469, "step": 1719 }, { "epoch": 2.5548961424332344, "grad_norm": 12.735292434692383, "learning_rate": 1.5e-05, "loss": 0.8847, "step": 1722 }, { "epoch": 2.559347181008902, "grad_norm": 7.6085686683654785, "learning_rate": 1.5e-05, "loss": 0.851, "step": 1725 }, { "epoch": 2.56379821958457, "grad_norm": 10.644798278808594, "learning_rate": 1.5e-05, "loss": 0.861, "step": 1728 }, { "epoch": 2.5682492581602374, "grad_norm": 7.817785263061523, "learning_rate": 1.5e-05, "loss": 0.8574, "step": 1731 }, { "epoch": 2.572700296735905, "grad_norm": 14.533990859985352, "learning_rate": 1.5e-05, "loss": 0.8883, "step": 1734 }, { "epoch": 2.577151335311573, "grad_norm": 9.98595905303955, "learning_rate": 1.5e-05, "loss": 0.7992, "step": 1737 }, { "epoch": 2.5816023738872405, "grad_norm": 13.704192161560059, "learning_rate": 1.5e-05, "loss": 0.8781, "step": 1740 }, { "epoch": 2.586053412462908, "grad_norm": 6.400760650634766, "learning_rate": 1.5e-05, "loss": 0.8248, "step": 1743 }, { "epoch": 2.5905044510385755, "grad_norm": 3.9698002338409424, "learning_rate": 1.5e-05, "loss": 0.7804, "step": 1746 }, { "epoch": 2.594955489614243, "grad_norm": 5.405271053314209, "learning_rate": 1.5e-05, "loss": 0.8691, "step": 1749 }, { "epoch": 2.599406528189911, "grad_norm": 7.326030731201172, "learning_rate": 1.5e-05, "loss": 0.7876, "step": 1752 }, { "epoch": 2.6038575667655786, "grad_norm": 12.94884967803955, "learning_rate": 1.5e-05, "loss": 0.7877, "step": 1755 }, { "epoch": 2.6083086053412465, "grad_norm": 12.542633056640625, "learning_rate": 1.5e-05, "loss": 0.8311, "step": 1758 }, { "epoch": 2.612759643916914, "grad_norm": 12.357892036437988, "learning_rate": 1.5e-05, "loss": 0.8398, "step": 1761 }, { "epoch": 2.6172106824925816, "grad_norm": 10.735803604125977, "learning_rate": 1.5e-05, "loss": 0.8557, "step": 1764 }, { "epoch": 2.621661721068249, "grad_norm": 7.849278450012207, "learning_rate": 1.5e-05, "loss": 0.8559, "step": 1767 }, { "epoch": 2.6261127596439167, "grad_norm": 7.459741592407227, "learning_rate": 1.5e-05, "loss": 0.8382, "step": 1770 }, { "epoch": 2.6305637982195846, "grad_norm": 9.422908782958984, "learning_rate": 1.5e-05, "loss": 0.8586, "step": 1773 }, { "epoch": 2.635014836795252, "grad_norm": 6.327311038970947, "learning_rate": 1.5e-05, "loss": 0.8232, "step": 1776 }, { "epoch": 2.63946587537092, "grad_norm": 10.571976661682129, "learning_rate": 1.5e-05, "loss": 0.8694, "step": 1779 }, { "epoch": 2.6439169139465877, "grad_norm": 17.467416763305664, "learning_rate": 1.5e-05, "loss": 0.8693, "step": 1782 }, { "epoch": 2.648367952522255, "grad_norm": 6.911043167114258, "learning_rate": 1.5e-05, "loss": 0.8643, "step": 1785 }, { "epoch": 2.6528189910979227, "grad_norm": 10.180506706237793, "learning_rate": 1.5e-05, "loss": 0.832, "step": 1788 }, { "epoch": 2.6572700296735903, "grad_norm": 5.487372398376465, "learning_rate": 1.5e-05, "loss": 0.8211, "step": 1791 }, { "epoch": 2.6617210682492582, "grad_norm": 8.488285064697266, "learning_rate": 1.5e-05, "loss": 0.8174, "step": 1794 }, { "epoch": 2.666172106824926, "grad_norm": 14.654566764831543, "learning_rate": 1.5e-05, "loss": 0.8443, "step": 1797 }, { "epoch": 2.6706231454005933, "grad_norm": 8.551965713500977, "learning_rate": 1.5e-05, "loss": 0.8778, "step": 1800 }, { "epoch": 2.6750741839762613, "grad_norm": 6.797290802001953, "learning_rate": 1.5e-05, "loss": 0.859, "step": 1803 }, { "epoch": 2.679525222551929, "grad_norm": 4.413401126861572, "learning_rate": 1.5e-05, "loss": 0.9004, "step": 1806 }, { "epoch": 2.6839762611275964, "grad_norm": 8.826961517333984, "learning_rate": 1.5e-05, "loss": 0.8618, "step": 1809 }, { "epoch": 2.688427299703264, "grad_norm": 6.915543556213379, "learning_rate": 1.5e-05, "loss": 0.8242, "step": 1812 }, { "epoch": 2.692878338278932, "grad_norm": 7.802698612213135, "learning_rate": 1.5e-05, "loss": 0.8373, "step": 1815 }, { "epoch": 2.6973293768545994, "grad_norm": 4.345271587371826, "learning_rate": 1.5e-05, "loss": 0.8212, "step": 1818 }, { "epoch": 2.701780415430267, "grad_norm": 8.312252044677734, "learning_rate": 1.5e-05, "loss": 0.807, "step": 1821 }, { "epoch": 2.706231454005935, "grad_norm": 5.19842004776001, "learning_rate": 1.5e-05, "loss": 0.8513, "step": 1824 }, { "epoch": 2.7106824925816024, "grad_norm": 14.573792457580566, "learning_rate": 1.5e-05, "loss": 0.8215, "step": 1827 }, { "epoch": 2.71513353115727, "grad_norm": 6.5800323486328125, "learning_rate": 1.5e-05, "loss": 0.8826, "step": 1830 }, { "epoch": 2.7195845697329375, "grad_norm": 14.643542289733887, "learning_rate": 1.5e-05, "loss": 0.7716, "step": 1833 }, { "epoch": 2.7240356083086055, "grad_norm": 12.744583129882812, "learning_rate": 1.5e-05, "loss": 0.8621, "step": 1836 }, { "epoch": 2.728486646884273, "grad_norm": 12.435503005981445, "learning_rate": 1.5e-05, "loss": 0.8908, "step": 1839 }, { "epoch": 2.7329376854599405, "grad_norm": 6.115302562713623, "learning_rate": 1.5e-05, "loss": 0.8921, "step": 1842 }, { "epoch": 2.7373887240356085, "grad_norm": 14.632364273071289, "learning_rate": 1.5e-05, "loss": 0.8535, "step": 1845 }, { "epoch": 2.741839762611276, "grad_norm": 5.676476001739502, "learning_rate": 1.5e-05, "loss": 0.8872, "step": 1848 }, { "epoch": 2.7462908011869436, "grad_norm": 12.727757453918457, "learning_rate": 1.5e-05, "loss": 0.8836, "step": 1851 }, { "epoch": 2.750741839762611, "grad_norm": 5.729983329772949, "learning_rate": 1.5e-05, "loss": 0.8453, "step": 1854 }, { "epoch": 2.755192878338279, "grad_norm": 8.607340812683105, "learning_rate": 1.5e-05, "loss": 0.8795, "step": 1857 }, { "epoch": 2.7596439169139466, "grad_norm": 7.55084228515625, "learning_rate": 1.5e-05, "loss": 0.8788, "step": 1860 }, { "epoch": 2.764094955489614, "grad_norm": 10.093510627746582, "learning_rate": 1.5e-05, "loss": 0.8816, "step": 1863 }, { "epoch": 2.768545994065282, "grad_norm": 8.678201675415039, "learning_rate": 1.5e-05, "loss": 0.867, "step": 1866 }, { "epoch": 2.7729970326409497, "grad_norm": 6.614081859588623, "learning_rate": 1.5e-05, "loss": 0.887, "step": 1869 }, { "epoch": 2.777448071216617, "grad_norm": 6.593700408935547, "learning_rate": 1.5e-05, "loss": 0.8929, "step": 1872 }, { "epoch": 2.7818991097922847, "grad_norm": 5.097481727600098, "learning_rate": 1.5e-05, "loss": 0.8629, "step": 1875 }, { "epoch": 2.7863501483679523, "grad_norm": 5.016817569732666, "learning_rate": 1.5e-05, "loss": 0.8413, "step": 1878 }, { "epoch": 2.7908011869436202, "grad_norm": 7.502362251281738, "learning_rate": 1.5e-05, "loss": 0.8945, "step": 1881 }, { "epoch": 2.7952522255192878, "grad_norm": 4.612887859344482, "learning_rate": 1.5e-05, "loss": 0.8811, "step": 1884 }, { "epoch": 2.7997032640949557, "grad_norm": 5.493846893310547, "learning_rate": 1.5e-05, "loss": 0.8757, "step": 1887 }, { "epoch": 2.8041543026706233, "grad_norm": 8.605670928955078, "learning_rate": 1.5e-05, "loss": 0.8047, "step": 1890 }, { "epoch": 2.808605341246291, "grad_norm": 12.178396224975586, "learning_rate": 1.5e-05, "loss": 0.8905, "step": 1893 }, { "epoch": 2.8130563798219583, "grad_norm": 8.929186820983887, "learning_rate": 1.5e-05, "loss": 0.8869, "step": 1896 }, { "epoch": 2.817507418397626, "grad_norm": 6.589859962463379, "learning_rate": 1.5e-05, "loss": 0.8473, "step": 1899 }, { "epoch": 2.821958456973294, "grad_norm": 10.543880462646484, "learning_rate": 1.5e-05, "loss": 0.8613, "step": 1902 }, { "epoch": 2.8264094955489614, "grad_norm": 8.176854133605957, "learning_rate": 1.5e-05, "loss": 0.8429, "step": 1905 }, { "epoch": 2.8308605341246293, "grad_norm": 5.652864456176758, "learning_rate": 1.5e-05, "loss": 0.8754, "step": 1908 }, { "epoch": 2.835311572700297, "grad_norm": 8.587650299072266, "learning_rate": 1.5e-05, "loss": 0.9134, "step": 1911 }, { "epoch": 2.8397626112759644, "grad_norm": 5.4106974601745605, "learning_rate": 1.5e-05, "loss": 0.8334, "step": 1914 }, { "epoch": 2.844213649851632, "grad_norm": 6.253225803375244, "learning_rate": 1.5e-05, "loss": 0.901, "step": 1917 }, { "epoch": 2.8486646884272995, "grad_norm": 8.90531063079834, "learning_rate": 1.5e-05, "loss": 0.8316, "step": 1920 }, { "epoch": 2.8531157270029674, "grad_norm": 4.412182807922363, "learning_rate": 1.5e-05, "loss": 0.8384, "step": 1923 }, { "epoch": 2.857566765578635, "grad_norm": 6.357685565948486, "learning_rate": 1.5e-05, "loss": 0.8903, "step": 1926 }, { "epoch": 2.8620178041543025, "grad_norm": 7.667703628540039, "learning_rate": 1.5e-05, "loss": 0.8455, "step": 1929 }, { "epoch": 2.8664688427299705, "grad_norm": 10.909478187561035, "learning_rate": 1.5e-05, "loss": 0.8399, "step": 1932 }, { "epoch": 2.870919881305638, "grad_norm": 7.347332954406738, "learning_rate": 1.5e-05, "loss": 0.8198, "step": 1935 }, { "epoch": 2.8753709198813056, "grad_norm": 7.22322416305542, "learning_rate": 1.5e-05, "loss": 0.9074, "step": 1938 }, { "epoch": 2.879821958456973, "grad_norm": 5.389438152313232, "learning_rate": 1.5e-05, "loss": 0.846, "step": 1941 }, { "epoch": 2.884272997032641, "grad_norm": 8.13633918762207, "learning_rate": 1.5e-05, "loss": 0.8615, "step": 1944 }, { "epoch": 2.8887240356083086, "grad_norm": 7.694199085235596, "learning_rate": 1.5e-05, "loss": 0.7791, "step": 1947 }, { "epoch": 2.893175074183976, "grad_norm": 10.673176765441895, "learning_rate": 1.5e-05, "loss": 0.8234, "step": 1950 }, { "epoch": 2.897626112759644, "grad_norm": 7.695837020874023, "learning_rate": 1.5e-05, "loss": 0.8471, "step": 1953 }, { "epoch": 2.9020771513353116, "grad_norm": 11.210200309753418, "learning_rate": 1.5e-05, "loss": 0.8561, "step": 1956 }, { "epoch": 2.906528189910979, "grad_norm": 13.856889724731445, "learning_rate": 1.5e-05, "loss": 0.8177, "step": 1959 }, { "epoch": 2.9109792284866467, "grad_norm": 6.0733489990234375, "learning_rate": 1.5e-05, "loss": 0.8885, "step": 1962 }, { "epoch": 2.9154302670623147, "grad_norm": 7.38955545425415, "learning_rate": 1.5e-05, "loss": 0.8502, "step": 1965 }, { "epoch": 2.919881305637982, "grad_norm": 13.866927146911621, "learning_rate": 1.5e-05, "loss": 0.8622, "step": 1968 }, { "epoch": 2.9243323442136497, "grad_norm": 6.984748363494873, "learning_rate": 1.5e-05, "loss": 0.9041, "step": 1971 }, { "epoch": 2.9287833827893177, "grad_norm": 13.53242301940918, "learning_rate": 1.5e-05, "loss": 0.8367, "step": 1974 }, { "epoch": 2.9332344213649852, "grad_norm": 6.702526092529297, "learning_rate": 1.5e-05, "loss": 0.8343, "step": 1977 }, { "epoch": 2.9376854599406528, "grad_norm": 5.4899678230285645, "learning_rate": 1.5e-05, "loss": 0.8873, "step": 1980 }, { "epoch": 2.9421364985163203, "grad_norm": 8.483062744140625, "learning_rate": 1.5e-05, "loss": 0.8078, "step": 1983 }, { "epoch": 2.9465875370919883, "grad_norm": 7.6923065185546875, "learning_rate": 1.5e-05, "loss": 0.8316, "step": 1986 }, { "epoch": 2.951038575667656, "grad_norm": 4.571675777435303, "learning_rate": 1.5e-05, "loss": 0.882, "step": 1989 }, { "epoch": 2.9554896142433233, "grad_norm": 8.073565483093262, "learning_rate": 1.5e-05, "loss": 0.8233, "step": 1992 }, { "epoch": 2.9599406528189913, "grad_norm": 13.49317455291748, "learning_rate": 1.5e-05, "loss": 0.9236, "step": 1995 }, { "epoch": 2.964391691394659, "grad_norm": 6.603755474090576, "learning_rate": 1.5e-05, "loss": 0.8692, "step": 1998 }, { "epoch": 2.9688427299703264, "grad_norm": 8.523149490356445, "learning_rate": 1.5e-05, "loss": 0.8333, "step": 2001 }, { "epoch": 2.973293768545994, "grad_norm": 9.513497352600098, "learning_rate": 1.5e-05, "loss": 0.8879, "step": 2004 }, { "epoch": 2.9777448071216615, "grad_norm": 8.058304786682129, "learning_rate": 1.5e-05, "loss": 0.8539, "step": 2007 }, { "epoch": 2.9821958456973294, "grad_norm": 6.942746162414551, "learning_rate": 1.5e-05, "loss": 0.8532, "step": 2010 }, { "epoch": 2.986646884272997, "grad_norm": 7.1598639488220215, "learning_rate": 1.5e-05, "loss": 0.8094, "step": 2013 }, { "epoch": 2.991097922848665, "grad_norm": 7.722570419311523, "learning_rate": 1.5e-05, "loss": 0.85, "step": 2016 }, { "epoch": 2.9955489614243325, "grad_norm": 7.609329700469971, "learning_rate": 1.5e-05, "loss": 0.9311, "step": 2019 }, { "epoch": 3.0, "grad_norm": 6.5114426612854, "learning_rate": 1.5e-05, "loss": 0.8043, "step": 2022 }, { "epoch": 3.0, "eval_loss": 2.056363105773926, "eval_runtime": 552.8503, "eval_samples_per_second": 2.789, "eval_steps_per_second": 0.349, "step": 2022 } ], "logging_steps": 3, "max_steps": 6740, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }