{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2540,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03937007874015748,
      "grad_norm": 9.399362564086914,
      "learning_rate": 7.086614173228347e-06,
      "loss": 1.4557,
      "step": 10
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 2.942493438720703,
      "learning_rate": 1.4960629921259845e-05,
      "loss": 0.4296,
      "step": 20
    },
    {
      "epoch": 0.11811023622047244,
      "grad_norm": 3.1724367141723633,
      "learning_rate": 2.283464566929134e-05,
      "loss": 0.3657,
      "step": 30
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 1.5789762735366821,
      "learning_rate": 3.070866141732284e-05,
      "loss": 0.3486,
      "step": 40
    },
    {
      "epoch": 0.1968503937007874,
      "grad_norm": 2.097869396209717,
      "learning_rate": 3.858267716535433e-05,
      "loss": 0.3044,
      "step": 50
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 3.880457639694214,
      "learning_rate": 4.645669291338583e-05,
      "loss": 0.3367,
      "step": 60
    },
    {
      "epoch": 0.2755905511811024,
      "grad_norm": 2.8253917694091797,
      "learning_rate": 5.433070866141733e-05,
      "loss": 0.3126,
      "step": 70
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 3.822925090789795,
      "learning_rate": 6.220472440944882e-05,
      "loss": 0.3004,
      "step": 80
    },
    {
      "epoch": 0.3543307086614173,
      "grad_norm": 1.3659324645996094,
      "learning_rate": 7.007874015748031e-05,
      "loss": 0.2605,
      "step": 90
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 1.7165173292160034,
      "learning_rate": 7.795275590551181e-05,
      "loss": 0.1676,
      "step": 100
    },
    {
      "epoch": 0.4330708661417323,
      "grad_norm": 1.704687476158142,
      "learning_rate": 8.582677165354331e-05,
      "loss": 0.1404,
      "step": 110
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 1.3101590871810913,
      "learning_rate": 9.370078740157481e-05,
      "loss": 0.1322,
      "step": 120
    },
    {
      "epoch": 0.5118110236220472,
      "grad_norm": 1.6621087789535522,
      "learning_rate": 9.999983049408561e-05,
      "loss": 0.1242,
      "step": 130
    },
    {
      "epoch": 0.5511811023622047,
      "grad_norm": 0.9743478298187256,
      "learning_rate": 9.999389790775648e-05,
      "loss": 0.1027,
      "step": 140
    },
    {
      "epoch": 0.5905511811023622,
      "grad_norm": 1.9478999376296997,
      "learning_rate": 9.997949117496292e-05,
      "loss": 0.1174,
      "step": 150
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.9509850740432739,
      "learning_rate": 9.995661273769822e-05,
      "loss": 0.1015,
      "step": 160
    },
    {
      "epoch": 0.6692913385826772,
      "grad_norm": 0.9505985379219055,
      "learning_rate": 9.992526647394022e-05,
      "loss": 0.102,
      "step": 170
    },
    {
      "epoch": 0.7086614173228346,
      "grad_norm": 1.489611268043518,
      "learning_rate": 9.988545769699399e-05,
      "loss": 0.097,
      "step": 180
    },
    {
      "epoch": 0.7480314960629921,
      "grad_norm": 1.1149543523788452,
      "learning_rate": 9.983719315459114e-05,
      "loss": 0.0925,
      "step": 190
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 1.0860552787780762,
      "learning_rate": 9.978048102774613e-05,
      "loss": 0.0964,
      "step": 200
    },
    {
      "epoch": 0.8267716535433071,
      "grad_norm": 1.2707302570343018,
      "learning_rate": 9.971533092936954e-05,
      "loss": 0.0844,
      "step": 210
    },
    {
      "epoch": 0.8661417322834646,
      "grad_norm": 1.1820255517959595,
      "learning_rate": 9.964175390263856e-05,
      "loss": 0.0805,
      "step": 220
    },
    {
      "epoch": 0.905511811023622,
      "grad_norm": 1.3937278985977173,
      "learning_rate": 9.955976241912535e-05,
      "loss": 0.0871,
      "step": 230
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.9341478943824768,
      "learning_rate": 9.946937037668275e-05,
      "loss": 0.0826,
      "step": 240
    },
    {
      "epoch": 0.984251968503937,
      "grad_norm": 1.7772321701049805,
      "learning_rate": 9.937059309708885e-05,
      "loss": 0.0873,
      "step": 250
    },
    {
      "epoch": 1.0236220472440944,
      "grad_norm": 1.2695393562316895,
      "learning_rate": 9.926344732344967e-05,
      "loss": 0.0794,
      "step": 260
    },
    {
      "epoch": 1.0629921259842519,
      "grad_norm": 1.1093697547912598,
      "learning_rate": 9.914795121736128e-05,
      "loss": 0.0758,
      "step": 270
    },
    {
      "epoch": 1.1023622047244095,
      "grad_norm": 0.8029543161392212,
      "learning_rate": 9.902412435583128e-05,
      "loss": 0.0678,
      "step": 280
    },
    {
      "epoch": 1.141732283464567,
      "grad_norm": 0.7547488808631897,
      "learning_rate": 9.88919877279604e-05,
      "loss": 0.0761,
      "step": 290
    },
    {
      "epoch": 1.1811023622047245,
      "grad_norm": 0.8116704225540161,
      "learning_rate": 9.875156373138489e-05,
      "loss": 0.057,
      "step": 300
    },
    {
      "epoch": 1.220472440944882,
      "grad_norm": 0.8954646587371826,
      "learning_rate": 9.86028761684799e-05,
      "loss": 0.0738,
      "step": 310
    },
    {
      "epoch": 1.2598425196850394,
      "grad_norm": 1.016405463218689,
      "learning_rate": 9.844595024232495e-05,
      "loss": 0.0901,
      "step": 320
    },
    {
      "epoch": 1.2992125984251968,
      "grad_norm": 1.395342469215393,
      "learning_rate": 9.828081255243198e-05,
      "loss": 0.0796,
      "step": 330
    },
    {
      "epoch": 1.3385826771653544,
      "grad_norm": 0.8092917203903198,
      "learning_rate": 9.81074910902365e-05,
      "loss": 0.0883,
      "step": 340
    },
    {
      "epoch": 1.3779527559055118,
      "grad_norm": 1.1169452667236328,
      "learning_rate": 9.792601523435307e-05,
      "loss": 0.0748,
      "step": 350
    },
    {
      "epoch": 1.4173228346456692,
      "grad_norm": 0.7994294166564941,
      "learning_rate": 9.773641574559546e-05,
      "loss": 0.0862,
      "step": 360
    },
    {
      "epoch": 1.4566929133858268,
      "grad_norm": 0.7973235249519348,
      "learning_rate": 9.753872476176254e-05,
      "loss": 0.0735,
      "step": 370
    },
    {
      "epoch": 1.4960629921259843,
      "grad_norm": 0.9219651818275452,
      "learning_rate": 9.73329757921909e-05,
      "loss": 0.077,
      "step": 380
    },
    {
      "epoch": 1.5354330708661417,
      "grad_norm": 1.1715006828308105,
      "learning_rate": 9.711920371207484e-05,
      "loss": 0.0691,
      "step": 390
    },
    {
      "epoch": 1.574803149606299,
      "grad_norm": 0.7752212882041931,
      "learning_rate": 9.68974447565549e-05,
      "loss": 0.0669,
      "step": 400
    },
    {
      "epoch": 1.6141732283464567,
      "grad_norm": 1.0260776281356812,
      "learning_rate": 9.666773651457588e-05,
      "loss": 0.0623,
      "step": 410
    },
    {
      "epoch": 1.6535433070866141,
      "grad_norm": 0.8338336944580078,
      "learning_rate": 9.643011792251538e-05,
      "loss": 0.0699,
      "step": 420
    },
    {
      "epoch": 1.6929133858267718,
      "grad_norm": 0.8776105642318726,
      "learning_rate": 9.618462925758392e-05,
      "loss": 0.0653,
      "step": 430
    },
    {
      "epoch": 1.7322834645669292,
      "grad_norm": 0.6896973252296448,
      "learning_rate": 9.593131213099789e-05,
      "loss": 0.0586,
      "step": 440
    },
    {
      "epoch": 1.7716535433070866,
      "grad_norm": 1.0852605104446411,
      "learning_rate": 9.567020948092616e-05,
      "loss": 0.0673,
      "step": 450
    },
    {
      "epoch": 1.811023622047244,
      "grad_norm": 1.0203490257263184,
      "learning_rate": 9.540136556521203e-05,
      "loss": 0.0663,
      "step": 460
    },
    {
      "epoch": 1.8503937007874016,
      "grad_norm": 0.774488091468811,
      "learning_rate": 9.512482595387132e-05,
      "loss": 0.0609,
      "step": 470
    },
    {
      "epoch": 1.889763779527559,
      "grad_norm": 0.5737660527229309,
      "learning_rate": 9.484063752136805e-05,
      "loss": 0.0606,
      "step": 480
    },
    {
      "epoch": 1.9291338582677167,
      "grad_norm": 1.0153898000717163,
      "learning_rate": 9.454884843866912e-05,
      "loss": 0.0737,
      "step": 490
    },
    {
      "epoch": 1.968503937007874,
      "grad_norm": 0.7526334524154663,
      "learning_rate": 9.424950816507909e-05,
      "loss": 0.0641,
      "step": 500
    },
    {
      "epoch": 2.0078740157480315,
      "grad_norm": 0.5760018825531006,
      "learning_rate": 9.394266743985671e-05,
      "loss": 0.0674,
      "step": 510
    },
    {
      "epoch": 2.047244094488189,
      "grad_norm": 0.70269775390625,
      "learning_rate": 9.36283782736144e-05,
      "loss": 0.0631,
      "step": 520
    },
    {
      "epoch": 2.0866141732283463,
      "grad_norm": 0.8864635229110718,
      "learning_rate": 9.330669393950219e-05,
      "loss": 0.0654,
      "step": 530
    },
    {
      "epoch": 2.1259842519685037,
      "grad_norm": 0.7043759226799011,
      "learning_rate": 9.297766896417793e-05,
      "loss": 0.0657,
      "step": 540
    },
    {
      "epoch": 2.1653543307086616,
      "grad_norm": 0.6329500675201416,
      "learning_rate": 9.264135911856462e-05,
      "loss": 0.0707,
      "step": 550
    },
    {
      "epoch": 2.204724409448819,
      "grad_norm": 0.4031962752342224,
      "learning_rate": 9.22978214083971e-05,
      "loss": 0.0528,
      "step": 560
    },
    {
      "epoch": 2.2440944881889764,
      "grad_norm": 0.5401821136474609,
      "learning_rate": 9.194711406455945e-05,
      "loss": 0.0654,
      "step": 570
    },
    {
      "epoch": 2.283464566929134,
      "grad_norm": 0.713798999786377,
      "learning_rate": 9.158929653321451e-05,
      "loss": 0.0555,
      "step": 580
    },
    {
      "epoch": 2.322834645669291,
      "grad_norm": 0.4728735387325287,
      "learning_rate": 9.122442946572768e-05,
      "loss": 0.0552,
      "step": 590
    },
    {
      "epoch": 2.362204724409449,
      "grad_norm": 0.7359452843666077,
      "learning_rate": 9.085257470838619e-05,
      "loss": 0.0677,
      "step": 600
    },
    {
      "epoch": 2.4015748031496065,
      "grad_norm": 0.6030870676040649,
      "learning_rate": 9.047379529191594e-05,
      "loss": 0.053,
      "step": 610
    },
    {
      "epoch": 2.440944881889764,
      "grad_norm": 0.5791817903518677,
      "learning_rate": 9.008815542079766e-05,
      "loss": 0.0493,
      "step": 620
    },
    {
      "epoch": 2.4803149606299213,
      "grad_norm": 0.8772215247154236,
      "learning_rate": 8.969572046238389e-05,
      "loss": 0.0721,
      "step": 630
    },
    {
      "epoch": 2.5196850393700787,
      "grad_norm": 0.8733668923377991,
      "learning_rate": 8.929655693581904e-05,
      "loss": 0.0597,
      "step": 640
    },
    {
      "epoch": 2.559055118110236,
      "grad_norm": 1.0022321939468384,
      "learning_rate": 8.889073250076421e-05,
      "loss": 0.0659,
      "step": 650
    },
    {
      "epoch": 2.5984251968503935,
      "grad_norm": 0.7206939458847046,
      "learning_rate": 8.84783159459285e-05,
      "loss": 0.0452,
      "step": 660
    },
    {
      "epoch": 2.637795275590551,
      "grad_norm": 0.8875113725662231,
      "learning_rate": 8.805937717740918e-05,
      "loss": 0.0539,
      "step": 670
    },
    {
      "epoch": 2.677165354330709,
      "grad_norm": 0.5767335295677185,
      "learning_rate": 8.763398720684232e-05,
      "loss": 0.0503,
      "step": 680
    },
    {
      "epoch": 2.716535433070866,
      "grad_norm": 0.5727648138999939,
      "learning_rate": 8.72022181393661e-05,
      "loss": 0.0457,
      "step": 690
    },
    {
      "epoch": 2.7559055118110236,
      "grad_norm": 0.8125827312469482,
      "learning_rate": 8.676414316139863e-05,
      "loss": 0.0607,
      "step": 700
    },
    {
      "epoch": 2.795275590551181,
      "grad_norm": 0.6720311641693115,
      "learning_rate": 8.631983652823267e-05,
      "loss": 0.0665,
      "step": 710
    },
    {
      "epoch": 2.8346456692913384,
      "grad_norm": 0.6637985706329346,
      "learning_rate": 8.586937355144908e-05,
      "loss": 0.068,
      "step": 720
    },
    {
      "epoch": 2.8740157480314963,
      "grad_norm": 0.7840360999107361,
      "learning_rate": 8.541283058615124e-05,
      "loss": 0.0561,
      "step": 730
    },
    {
      "epoch": 2.9133858267716537,
      "grad_norm": 0.44171687960624695,
      "learning_rate": 8.495028501802251e-05,
      "loss": 0.0534,
      "step": 740
    },
    {
      "epoch": 2.952755905511811,
      "grad_norm": 0.4313163459300995,
      "learning_rate": 8.448181525020921e-05,
      "loss": 0.0391,
      "step": 750
    },
    {
      "epoch": 2.9921259842519685,
      "grad_norm": 0.7261826395988464,
      "learning_rate": 8.400750069003086e-05,
      "loss": 0.0486,
      "step": 760
    },
    {
      "epoch": 3.031496062992126,
      "grad_norm": 0.4469556212425232,
      "learning_rate": 8.352742173552046e-05,
      "loss": 0.0511,
      "step": 770
    },
    {
      "epoch": 3.0708661417322833,
      "grad_norm": 0.9129867553710938,
      "learning_rate": 8.304165976179667e-05,
      "loss": 0.0533,
      "step": 780
    },
    {
      "epoch": 3.1102362204724407,
      "grad_norm": 1.2041122913360596,
      "learning_rate": 8.255029710727048e-05,
      "loss": 0.0671,
      "step": 790
    },
    {
      "epoch": 3.1496062992125986,
      "grad_norm": 0.7420069575309753,
      "learning_rate": 8.20534170596885e-05,
      "loss": 0.0685,
      "step": 800
    },
    {
      "epoch": 3.188976377952756,
      "grad_norm": 0.3230190575122833,
      "learning_rate": 8.155110384201544e-05,
      "loss": 0.0647,
      "step": 810
    },
    {
      "epoch": 3.2283464566929134,
      "grad_norm": 0.6603342890739441,
      "learning_rate": 8.104344259815794e-05,
      "loss": 0.0558,
      "step": 820
    },
    {
      "epoch": 3.267716535433071,
      "grad_norm": 0.5632081031799316,
      "learning_rate": 8.053051937853248e-05,
      "loss": 0.0558,
      "step": 830
    },
    {
      "epoch": 3.3070866141732282,
      "grad_norm": 0.7754299640655518,
      "learning_rate": 8.001242112547942e-05,
      "loss": 0.0632,
      "step": 840
    },
    {
      "epoch": 3.3464566929133857,
      "grad_norm": 0.7823946475982666,
      "learning_rate": 7.948923565852598e-05,
      "loss": 0.0662,
      "step": 850
    },
    {
      "epoch": 3.3858267716535435,
      "grad_norm": 0.7399844527244568,
      "learning_rate": 7.896105165950059e-05,
      "loss": 0.052,
      "step": 860
    },
    {
      "epoch": 3.425196850393701,
      "grad_norm": 0.8208476305007935,
      "learning_rate": 7.842795865750088e-05,
      "loss": 0.0486,
      "step": 870
    },
    {
      "epoch": 3.4645669291338583,
      "grad_norm": 0.5400993227958679,
      "learning_rate": 7.789004701371825e-05,
      "loss": 0.0443,
      "step": 880
    },
    {
      "epoch": 3.5039370078740157,
      "grad_norm": 0.6949036717414856,
      "learning_rate": 7.734740790612136e-05,
      "loss": 0.0597,
      "step": 890
    },
    {
      "epoch": 3.543307086614173,
      "grad_norm": 1.0321848392486572,
      "learning_rate": 7.680013331400098e-05,
      "loss": 0.0446,
      "step": 900
    },
    {
      "epoch": 3.5826771653543306,
      "grad_norm": 0.5193206071853638,
      "learning_rate": 7.624831600237937e-05,
      "loss": 0.0499,
      "step": 910
    },
    {
      "epoch": 3.622047244094488,
      "grad_norm": 0.755699872970581,
      "learning_rate": 7.569204950628605e-05,
      "loss": 0.0595,
      "step": 920
    },
    {
      "epoch": 3.661417322834646,
      "grad_norm": 0.4758411645889282,
      "learning_rate": 7.513142811490356e-05,
      "loss": 0.0403,
      "step": 930
    },
    {
      "epoch": 3.7007874015748032,
      "grad_norm": 0.9744377732276917,
      "learning_rate": 7.456654685558481e-05,
      "loss": 0.0566,
      "step": 940
    },
    {
      "epoch": 3.7401574803149606,
      "grad_norm": 0.46791282296180725,
      "learning_rate": 7.399750147774575e-05,
      "loss": 0.0445,
      "step": 950
    },
    {
      "epoch": 3.779527559055118,
      "grad_norm": 0.4000394642353058,
      "learning_rate": 7.34243884366355e-05,
      "loss": 0.0577,
      "step": 960
    },
    {
      "epoch": 3.8188976377952755,
      "grad_norm": 0.6318042874336243,
      "learning_rate": 7.28473048769868e-05,
      "loss": 0.0491,
      "step": 970
    },
    {
      "epoch": 3.8582677165354333,
      "grad_norm": 0.9635873436927795,
      "learning_rate": 7.226634861654965e-05,
      "loss": 0.0501,
      "step": 980
    },
    {
      "epoch": 3.8976377952755907,
      "grad_norm": 0.7494231462478638,
      "learning_rate": 7.168161812951084e-05,
      "loss": 0.0511,
      "step": 990
    },
    {
      "epoch": 3.937007874015748,
      "grad_norm": 0.6621044874191284,
      "learning_rate": 7.109321252980218e-05,
      "loss": 0.0429,
      "step": 1000
    },
    {
      "epoch": 3.9763779527559056,
      "grad_norm": 0.6507459878921509,
      "learning_rate": 7.05012315543004e-05,
      "loss": 0.0514,
      "step": 1010
    },
    {
      "epoch": 4.015748031496063,
      "grad_norm": 0.6397859454154968,
      "learning_rate": 6.990577554592134e-05,
      "loss": 0.051,
      "step": 1020
    },
    {
      "epoch": 4.05511811023622,
      "grad_norm": 0.8459829688072205,
      "learning_rate": 6.930694543661149e-05,
      "loss": 0.0492,
      "step": 1030
    },
    {
      "epoch": 4.094488188976378,
      "grad_norm": 0.6369995474815369,
      "learning_rate": 6.870484273023968e-05,
      "loss": 0.0447,
      "step": 1040
    },
    {
      "epoch": 4.133858267716535,
      "grad_norm": 0.6142792701721191,
      "learning_rate": 6.809956948539166e-05,
      "loss": 0.044,
      "step": 1050
    },
    {
      "epoch": 4.173228346456693,
      "grad_norm": 0.5256998538970947,
      "learning_rate": 6.749122829807103e-05,
      "loss": 0.0427,
      "step": 1060
    },
    {
      "epoch": 4.21259842519685,
      "grad_norm": 0.7650443911552429,
      "learning_rate": 6.687992228430872e-05,
      "loss": 0.0525,
      "step": 1070
    },
    {
      "epoch": 4.251968503937007,
      "grad_norm": 0.6876934170722961,
      "learning_rate": 6.62657550626844e-05,
      "loss": 0.0385,
      "step": 1080
    },
    {
      "epoch": 4.291338582677166,
      "grad_norm": 0.7614730000495911,
      "learning_rate": 6.564883073676287e-05,
      "loss": 0.0543,
      "step": 1090
    },
    {
      "epoch": 4.330708661417323,
      "grad_norm": 0.591896653175354,
      "learning_rate": 6.502925387744807e-05,
      "loss": 0.044,
      "step": 1100
    },
    {
      "epoch": 4.3700787401574805,
      "grad_norm": 0.8287089467048645,
      "learning_rate": 6.440712950525791e-05,
      "loss": 0.0427,
      "step": 1110
    },
    {
      "epoch": 4.409448818897638,
      "grad_norm": 0.8359081745147705,
      "learning_rate": 6.3782563072523e-05,
      "loss": 0.0513,
      "step": 1120
    },
    {
      "epoch": 4.448818897637795,
      "grad_norm": 0.4965924322605133,
      "learning_rate": 6.315566044551197e-05,
      "loss": 0.0503,
      "step": 1130
    },
    {
      "epoch": 4.488188976377953,
      "grad_norm": 0.5000588297843933,
      "learning_rate": 6.252652788648691e-05,
      "loss": 0.0348,
      "step": 1140
    },
    {
      "epoch": 4.52755905511811,
      "grad_norm": 0.5434101819992065,
      "learning_rate": 6.18952720356914e-05,
      "loss": 0.0409,
      "step": 1150
    },
    {
      "epoch": 4.566929133858268,
      "grad_norm": 0.6852266788482666,
      "learning_rate": 6.126199989327462e-05,
      "loss": 0.0437,
      "step": 1160
    },
    {
      "epoch": 4.606299212598425,
      "grad_norm": 0.684528648853302,
      "learning_rate": 6.062681880115453e-05,
      "loss": 0.0447,
      "step": 1170
    },
    {
      "epoch": 4.645669291338582,
      "grad_norm": 0.6656462550163269,
      "learning_rate": 5.998983642482296e-05,
      "loss": 0.0429,
      "step": 1180
    },
    {
      "epoch": 4.68503937007874,
      "grad_norm": 0.7268936634063721,
      "learning_rate": 5.935116073509592e-05,
      "loss": 0.0478,
      "step": 1190
    },
    {
      "epoch": 4.724409448818898,
      "grad_norm": 0.5984519720077515,
      "learning_rate": 5.871089998981214e-05,
      "loss": 0.038,
      "step": 1200
    },
    {
      "epoch": 4.7637795275590555,
      "grad_norm": 0.6121963858604431,
      "learning_rate": 5.8069162715483e-05,
      "loss": 0.0388,
      "step": 1210
    },
    {
      "epoch": 4.803149606299213,
      "grad_norm": 0.47043073177337646,
      "learning_rate": 5.742605768889693e-05,
      "loss": 0.0355,
      "step": 1220
    },
    {
      "epoch": 4.84251968503937,
      "grad_norm": 0.5659105181694031,
      "learning_rate": 5.6781693918681275e-05,
      "loss": 0.0434,
      "step": 1230
    },
    {
      "epoch": 4.881889763779528,
      "grad_norm": 0.5114152431488037,
      "learning_rate": 5.613618062682502e-05,
      "loss": 0.0379,
      "step": 1240
    },
    {
      "epoch": 4.921259842519685,
      "grad_norm": 0.6941166520118713,
      "learning_rate": 5.5489627230165176e-05,
      "loss": 0.046,
      "step": 1250
    },
    {
      "epoch": 4.960629921259843,
      "grad_norm": 0.6256818771362305,
      "learning_rate": 5.48421433218403e-05,
      "loss": 0.0505,
      "step": 1260
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.8666470646858215,
      "learning_rate": 5.419383865271402e-05,
      "loss": 0.0419,
      "step": 1270
    },
    {
      "epoch": 5.039370078740157,
      "grad_norm": 1.043869972229004,
      "learning_rate": 5.354482311277193e-05,
      "loss": 0.0483,
      "step": 1280
    },
    {
      "epoch": 5.078740157480315,
      "grad_norm": 0.4656646251678467,
      "learning_rate": 5.289520671249479e-05,
      "loss": 0.0333,
      "step": 1290
    },
    {
      "epoch": 5.118110236220472,
      "grad_norm": 0.45656928420066833,
      "learning_rate": 5.224509956421133e-05,
      "loss": 0.0373,
      "step": 1300
    },
    {
      "epoch": 5.15748031496063,
      "grad_norm": 0.7461910247802734,
      "learning_rate": 5.159461186343385e-05,
      "loss": 0.0427,
      "step": 1310
    },
    {
      "epoch": 5.196850393700787,
      "grad_norm": 0.646958589553833,
      "learning_rate": 5.094385387017967e-05,
      "loss": 0.0447,
      "step": 1320
    },
    {
      "epoch": 5.2362204724409445,
      "grad_norm": 0.2877052128314972,
      "learning_rate": 5.02929358902817e-05,
      "loss": 0.036,
      "step": 1330
    },
    {
      "epoch": 5.275590551181103,
      "grad_norm": 0.5020745992660522,
      "learning_rate": 4.964196825669112e-05,
      "loss": 0.0485,
      "step": 1340
    },
    {
      "epoch": 5.31496062992126,
      "grad_norm": 0.494816392660141,
      "learning_rate": 4.899106131077562e-05,
      "loss": 0.0446,
      "step": 1350
    },
    {
      "epoch": 5.354330708661418,
      "grad_norm": 0.4856385290622711,
      "learning_rate": 4.834032538361607e-05,
      "loss": 0.0418,
      "step": 1360
    },
    {
      "epoch": 5.393700787401575,
      "grad_norm": 0.2937265634536743,
      "learning_rate": 4.768987077730509e-05,
      "loss": 0.0329,
      "step": 1370
    },
    {
      "epoch": 5.433070866141732,
      "grad_norm": 0.44668734073638916,
      "learning_rate": 4.703980774625038e-05,
      "loss": 0.0373,
      "step": 1380
    },
    {
      "epoch": 5.47244094488189,
      "grad_norm": 0.49580681324005127,
      "learning_rate": 4.6390246478486196e-05,
      "loss": 0.0393,
      "step": 1390
    },
    {
      "epoch": 5.511811023622047,
      "grad_norm": 0.41949307918548584,
      "learning_rate": 4.574129707699617e-05,
      "loss": 0.0336,
      "step": 1400
    },
    {
      "epoch": 5.551181102362205,
      "grad_norm": 0.7558311223983765,
      "learning_rate": 4.509306954105028e-05,
      "loss": 0.036,
      "step": 1410
    },
    {
      "epoch": 5.590551181102362,
      "grad_norm": 0.6058725118637085,
      "learning_rate": 4.4445673747559776e-05,
      "loss": 0.0389,
      "step": 1420
    },
    {
      "epoch": 5.6299212598425195,
      "grad_norm": 0.5574952960014343,
      "learning_rate": 4.3799219432452527e-05,
      "loss": 0.0441,
      "step": 1430
    },
    {
      "epoch": 5.669291338582677,
      "grad_norm": 0.3628334403038025,
      "learning_rate": 4.315381617207239e-05,
      "loss": 0.0306,
      "step": 1440
    },
    {
      "epoch": 5.708661417322834,
      "grad_norm": 0.6688554286956787,
      "learning_rate": 4.2509573364605695e-05,
      "loss": 0.0384,
      "step": 1450
    },
    {
      "epoch": 5.748031496062993,
      "grad_norm": 0.4437117874622345,
      "learning_rate": 4.1866600211537734e-05,
      "loss": 0.0359,
      "step": 1460
    },
    {
      "epoch": 5.78740157480315,
      "grad_norm": 0.5025441646575928,
      "learning_rate": 4.122500569914285e-05,
      "loss": 0.0339,
      "step": 1470
    },
    {
      "epoch": 5.826771653543307,
      "grad_norm": 0.6300631761550903,
      "learning_rate": 4.058489858001079e-05,
      "loss": 0.0412,
      "step": 1480
    },
    {
      "epoch": 5.866141732283465,
      "grad_norm": 0.33207225799560547,
      "learning_rate": 3.9946387354612754e-05,
      "loss": 0.0382,
      "step": 1490
    },
    {
      "epoch": 5.905511811023622,
      "grad_norm": 0.4854939877986908,
      "learning_rate": 3.930958025291021e-05,
      "loss": 0.0355,
      "step": 1500
    },
    {
      "epoch": 5.94488188976378,
      "grad_norm": 0.4515645205974579,
      "learning_rate": 3.867458521600943e-05,
      "loss": 0.0403,
      "step": 1510
    },
    {
      "epoch": 5.984251968503937,
      "grad_norm": 0.5812086462974548,
      "learning_rate": 3.804150987786525e-05,
      "loss": 0.045,
      "step": 1520
    },
    {
      "epoch": 6.0236220472440944,
      "grad_norm": 0.4998365640640259,
      "learning_rate": 3.7410461547036534e-05,
      "loss": 0.0372,
      "step": 1530
    },
    {
      "epoch": 6.062992125984252,
      "grad_norm": 0.2945879101753235,
      "learning_rate": 3.6781547188497135e-05,
      "loss": 0.0355,
      "step": 1540
    },
    {
      "epoch": 6.102362204724409,
      "grad_norm": 0.4646545946598053,
      "learning_rate": 3.6154873405504895e-05,
      "loss": 0.0271,
      "step": 1550
    },
    {
      "epoch": 6.141732283464567,
      "grad_norm": 0.5919941067695618,
      "learning_rate": 3.553054642153192e-05,
      "loss": 0.04,
      "step": 1560
    },
    {
      "epoch": 6.181102362204724,
      "grad_norm": 0.4582063555717468,
      "learning_rate": 3.4908672062259487e-05,
      "loss": 0.0308,
      "step": 1570
    },
    {
      "epoch": 6.2204724409448815,
      "grad_norm": 0.5543438196182251,
      "learning_rate": 3.428935573764005e-05,
      "loss": 0.0319,
      "step": 1580
    },
    {
      "epoch": 6.259842519685039,
      "grad_norm": 0.3909936249256134,
      "learning_rate": 3.367270242402999e-05,
      "loss": 0.0305,
      "step": 1590
    },
    {
      "epoch": 6.299212598425197,
      "grad_norm": 0.4638426601886749,
      "learning_rate": 3.30588166463957e-05,
      "loss": 0.0298,
      "step": 1600
    },
    {
      "epoch": 6.338582677165355,
      "grad_norm": 0.5119015574455261,
      "learning_rate": 3.2447802460596124e-05,
      "loss": 0.0299,
      "step": 1610
    },
    {
      "epoch": 6.377952755905512,
      "grad_norm": 0.4361736476421356,
      "learning_rate": 3.183976343574513e-05,
      "loss": 0.0279,
      "step": 1620
    },
    {
      "epoch": 6.417322834645669,
      "grad_norm": 0.7282920479774475,
      "learning_rate": 3.123480263665597e-05,
      "loss": 0.0305,
      "step": 1630
    },
    {
      "epoch": 6.456692913385827,
      "grad_norm": 0.4011118710041046,
      "learning_rate": 3.063302260637151e-05,
      "loss": 0.0294,
      "step": 1640
    },
    {
      "epoch": 6.496062992125984,
      "grad_norm": 0.44994768500328064,
      "learning_rate": 3.0034525348782855e-05,
      "loss": 0.0256,
      "step": 1650
    },
    {
      "epoch": 6.535433070866142,
      "grad_norm": 0.4626915454864502,
      "learning_rate": 2.9439412311339175e-05,
      "loss": 0.039,
      "step": 1660
    },
    {
      "epoch": 6.574803149606299,
      "grad_norm": 0.6001310348510742,
      "learning_rate": 2.8847784367852184e-05,
      "loss": 0.025,
      "step": 1670
    },
    {
      "epoch": 6.6141732283464565,
      "grad_norm": 0.38421395421028137,
      "learning_rate": 2.8259741801397477e-05,
      "loss": 0.0373,
      "step": 1680
    },
    {
      "epoch": 6.653543307086614,
      "grad_norm": 0.37881389260292053,
      "learning_rate": 2.7675384287316363e-05,
      "loss": 0.034,
      "step": 1690
    },
    {
      "epoch": 6.692913385826771,
      "grad_norm": 0.43850505352020264,
      "learning_rate": 2.709481087632041e-05,
      "loss": 0.0367,
      "step": 1700
    },
    {
      "epoch": 6.73228346456693,
      "grad_norm": 0.6985974907875061,
      "learning_rate": 2.6518119977702e-05,
      "loss": 0.0364,
      "step": 1710
    },
    {
      "epoch": 6.771653543307087,
      "grad_norm": 0.4026467502117157,
      "learning_rate": 2.5945409342653726e-05,
      "loss": 0.0363,
      "step": 1720
    },
    {
      "epoch": 6.811023622047244,
      "grad_norm": 0.36351197957992554,
      "learning_rate": 2.5376776047698965e-05,
      "loss": 0.03,
      "step": 1730
    },
    {
      "epoch": 6.850393700787402,
      "grad_norm": 0.2879717946052551,
      "learning_rate": 2.4812316478237353e-05,
      "loss": 0.0309,
      "step": 1740
    },
    {
      "epoch": 6.889763779527559,
      "grad_norm": 0.6626470685005188,
      "learning_rate": 2.4252126312206873e-05,
      "loss": 0.0431,
      "step": 1750
    },
    {
      "epoch": 6.929133858267717,
      "grad_norm": 0.4203033447265625,
      "learning_rate": 2.3696300503866204e-05,
      "loss": 0.0276,
      "step": 1760
    },
    {
      "epoch": 6.968503937007874,
      "grad_norm": 0.5207045078277588,
      "learning_rate": 2.314493326769968e-05,
      "loss": 0.0294,
      "step": 1770
    },
    {
      "epoch": 7.0078740157480315,
      "grad_norm": 0.4823305308818817,
      "learning_rate": 2.259811806244741e-05,
      "loss": 0.044,
      "step": 1780
    },
    {
      "epoch": 7.047244094488189,
      "grad_norm": 0.6036306023597717,
      "learning_rate": 2.2055947575263912e-05,
      "loss": 0.0283,
      "step": 1790
    },
    {
      "epoch": 7.086614173228346,
      "grad_norm": 0.5445898175239563,
      "learning_rate": 2.1518513706007155e-05,
      "loss": 0.0299,
      "step": 1800
    },
    {
      "epoch": 7.125984251968504,
      "grad_norm": 0.44029784202575684,
      "learning_rate": 2.0985907551661206e-05,
      "loss": 0.0349,
      "step": 1810
    },
    {
      "epoch": 7.165354330708661,
      "grad_norm": 0.4250989258289337,
      "learning_rate": 2.0458219390895106e-05,
      "loss": 0.0301,
      "step": 1820
    },
    {
      "epoch": 7.2047244094488185,
      "grad_norm": 0.31232279539108276,
      "learning_rate": 1.9935538668760057e-05,
      "loss": 0.0421,
      "step": 1830
    },
    {
      "epoch": 7.244094488188976,
      "grad_norm": 0.5633496642112732,
      "learning_rate": 1.9417953981528424e-05,
      "loss": 0.03,
      "step": 1840
    },
    {
      "epoch": 7.283464566929134,
      "grad_norm": 0.7122541666030884,
      "learning_rate": 1.890555306167619e-05,
      "loss": 0.0343,
      "step": 1850
    },
    {
      "epoch": 7.322834645669292,
      "grad_norm": 0.4240683615207672,
      "learning_rate": 1.8398422763011985e-05,
      "loss": 0.0244,
      "step": 1860
    },
    {
      "epoch": 7.362204724409449,
      "grad_norm": 0.3466864824295044,
      "learning_rate": 1.789664904595518e-05,
      "loss": 0.0265,
      "step": 1870
    },
    {
      "epoch": 7.4015748031496065,
      "grad_norm": 0.7498940229415894,
      "learning_rate": 1.7400316962965087e-05,
      "loss": 0.0303,
      "step": 1880
    },
    {
      "epoch": 7.440944881889764,
      "grad_norm": 0.38159969449043274,
      "learning_rate": 1.6909510644124455e-05,
      "loss": 0.0261,
      "step": 1890
    },
    {
      "epoch": 7.480314960629921,
      "grad_norm": 0.5678858160972595,
      "learning_rate": 1.642431328287899e-05,
      "loss": 0.035,
      "step": 1900
    },
    {
      "epoch": 7.519685039370079,
      "grad_norm": 0.4290473461151123,
      "learning_rate": 1.594480712193579e-05,
      "loss": 0.026,
      "step": 1910
    },
    {
      "epoch": 7.559055118110236,
      "grad_norm": 0.5522475242614746,
      "learning_rate": 1.547107343932299e-05,
      "loss": 0.029,
      "step": 1920
    },
    {
      "epoch": 7.5984251968503935,
      "grad_norm": 0.41189444065093994,
      "learning_rate": 1.5003192534612675e-05,
      "loss": 0.0243,
      "step": 1930
    },
    {
      "epoch": 7.637795275590551,
      "grad_norm": 0.1887262910604477,
      "learning_rate": 1.4541243715310005e-05,
      "loss": 0.0255,
      "step": 1940
    },
    {
      "epoch": 7.677165354330708,
      "grad_norm": 0.5400164127349854,
      "learning_rate": 1.4085305283410166e-05,
      "loss": 0.0251,
      "step": 1950
    },
    {
      "epoch": 7.716535433070866,
      "grad_norm": 0.5268674492835999,
      "learning_rate": 1.3635454522125946e-05,
      "loss": 0.036,
      "step": 1960
    },
    {
      "epoch": 7.755905511811024,
      "grad_norm": 0.6157100200653076,
      "learning_rate": 1.3191767682788003e-05,
      "loss": 0.0266,
      "step": 1970
    },
    {
      "epoch": 7.7952755905511815,
      "grad_norm": 0.3194139301776886,
      "learning_rate": 1.2754319971919842e-05,
      "loss": 0.0243,
      "step": 1980
    },
    {
      "epoch": 7.834645669291339,
      "grad_norm": 0.37460631132125854,
      "learning_rate": 1.2323185538490229e-05,
      "loss": 0.0357,
      "step": 1990
    },
    {
      "epoch": 7.874015748031496,
      "grad_norm": 0.37989112734794617,
      "learning_rate": 1.1898437461344518e-05,
      "loss": 0.0318,
      "step": 2000
    },
    {
      "epoch": 7.913385826771654,
      "grad_norm": 0.42782655358314514,
      "learning_rate": 1.1480147736817598e-05,
      "loss": 0.0263,
      "step": 2010
    },
    {
      "epoch": 7.952755905511811,
      "grad_norm": 0.5276915431022644,
      "learning_rate": 1.1068387266530267e-05,
      "loss": 0.025,
      "step": 2020
    },
    {
      "epoch": 7.9921259842519685,
      "grad_norm": 0.6227043271064758,
      "learning_rate": 1.0663225845371045e-05,
      "loss": 0.0296,
      "step": 2030
    },
    {
      "epoch": 8.031496062992126,
      "grad_norm": 0.2772444784641266,
      "learning_rate": 1.026473214966584e-05,
      "loss": 0.0346,
      "step": 2040
    },
    {
      "epoch": 8.070866141732283,
      "grad_norm": 0.4647983908653259,
      "learning_rate": 9.872973725536955e-06,
      "loss": 0.0308,
      "step": 2050
    },
    {
      "epoch": 8.11023622047244,
      "grad_norm": 0.32798609137535095,
      "learning_rate": 9.488016977453807e-06,
      "loss": 0.0248,
      "step": 2060
    },
    {
      "epoch": 8.149606299212598,
      "grad_norm": 0.2955688238143921,
      "learning_rate": 9.109927156977122e-06,
      "loss": 0.0268,
      "step": 2070
    },
    {
      "epoch": 8.188976377952756,
      "grad_norm": 0.3243492543697357,
      "learning_rate": 8.738768351698574e-06,
      "loss": 0.0273,
      "step": 2080
    },
    {
      "epoch": 8.228346456692913,
      "grad_norm": 0.2228180319070816,
      "learning_rate": 8.374603474377718e-06,
      "loss": 0.0276,
      "step": 2090
    },
    {
      "epoch": 8.26771653543307,
      "grad_norm": 0.4943491816520691,
      "learning_rate": 8.017494252278019e-06,
      "loss": 0.0338,
      "step": 2100
    },
    {
      "epoch": 8.307086614173228,
      "grad_norm": 0.26574063301086426,
      "learning_rate": 7.667501216703849e-06,
      "loss": 0.0286,
      "step": 2110
    },
    {
      "epoch": 8.346456692913385,
      "grad_norm": 0.561998188495636,
      "learning_rate": 7.324683692740259e-06,
      "loss": 0.0302,
      "step": 2120
    },
    {
      "epoch": 8.385826771653543,
      "grad_norm": 0.5270190238952637,
      "learning_rate": 6.989099789197112e-06,
      "loss": 0.0309,
      "step": 2130
    },
    {
      "epoch": 8.4251968503937,
      "grad_norm": 0.49484336376190186,
      "learning_rate": 6.660806388759505e-06,
      "loss": 0.0255,
      "step": 2140
    },
    {
      "epoch": 8.464566929133857,
      "grad_norm": 0.5083401799201965,
      "learning_rate": 6.339859138345838e-06,
      "loss": 0.0253,
      "step": 2150
    },
    {
      "epoch": 8.503937007874015,
      "grad_norm": 0.34489235281944275,
      "learning_rate": 6.026312439675552e-06,
      "loss": 0.0216,
      "step": 2160
    },
    {
      "epoch": 8.543307086614174,
      "grad_norm": 0.41160085797309875,
      "learning_rate": 5.720219440047797e-06,
      "loss": 0.0168,
      "step": 2170
    },
    {
      "epoch": 8.582677165354331,
      "grad_norm": 0.2923263609409332,
      "learning_rate": 5.421632023332779e-06,
      "loss": 0.0264,
      "step": 2180
    },
    {
      "epoch": 8.622047244094489,
      "grad_norm": 0.522784411907196,
      "learning_rate": 5.130600801177294e-06,
      "loss": 0.0222,
      "step": 2190
    },
    {
      "epoch": 8.661417322834646,
      "grad_norm": 0.413273423910141,
      "learning_rate": 4.8471751044257995e-06,
      "loss": 0.0312,
      "step": 2200
    },
    {
      "epoch": 8.700787401574804,
      "grad_norm": 0.4753558039665222,
      "learning_rate": 4.571402974758715e-06,
      "loss": 0.0285,
      "step": 2210
    },
    {
      "epoch": 8.740157480314961,
      "grad_norm": 0.36735445261001587,
      "learning_rate": 4.303331156549162e-06,
      "loss": 0.0303,
      "step": 2220
    },
    {
      "epoch": 8.779527559055119,
      "grad_norm": 0.3742741048336029,
      "learning_rate": 4.043005088939616e-06,
      "loss": 0.0166,
      "step": 2230
    },
    {
      "epoch": 8.818897637795276,
      "grad_norm": 0.2975374460220337,
      "learning_rate": 3.7904688981398485e-06,
      "loss": 0.0233,
      "step": 2240
    },
    {
      "epoch": 8.858267716535433,
      "grad_norm": 0.4250410497188568,
      "learning_rate": 3.5457653899473197e-06,
      "loss": 0.0343,
      "step": 2250
    },
    {
      "epoch": 8.89763779527559,
      "grad_norm": 0.41226938366889954,
      "learning_rate": 3.3089360424914674e-06,
      "loss": 0.0329,
      "step": 2260
    },
    {
      "epoch": 8.937007874015748,
      "grad_norm": 0.18084678053855896,
      "learning_rate": 3.080020999203026e-06,
      "loss": 0.0195,
      "step": 2270
    },
    {
      "epoch": 8.976377952755906,
      "grad_norm": 0.29392436146736145,
      "learning_rate": 2.8590590620095336e-06,
      "loss": 0.0257,
      "step": 2280
    },
    {
      "epoch": 9.015748031496063,
      "grad_norm": 0.2687474489212036,
      "learning_rate": 2.646087684758325e-06,
      "loss": 0.0274,
      "step": 2290
    },
    {
      "epoch": 9.05511811023622,
      "grad_norm": 0.1699579358100891,
      "learning_rate": 2.4411429668679043e-06,
      "loss": 0.0217,
      "step": 2300
    },
    {
      "epoch": 9.094488188976378,
      "grad_norm": 0.2965308725833893,
      "learning_rate": 2.2442596472089907e-06,
      "loss": 0.0266,
      "step": 2310
    },
    {
      "epoch": 9.133858267716535,
      "grad_norm": 0.29086264967918396,
      "learning_rate": 2.0554710982161607e-06,
      "loss": 0.0202,
      "step": 2320
    },
    {
      "epoch": 9.173228346456693,
      "grad_norm": 0.525842010974884,
      "learning_rate": 1.8748093202311078e-06,
      "loss": 0.0252,
      "step": 2330
    },
    {
      "epoch": 9.21259842519685,
      "grad_norm": 0.23323991894721985,
      "learning_rate": 1.7023049360784193e-06,
      "loss": 0.0257,
      "step": 2340
    },
    {
      "epoch": 9.251968503937007,
      "grad_norm": 0.3511553704738617,
      "learning_rate": 1.5379871858749784e-06,
      "loss": 0.0341,
      "step": 2350
    },
    {
      "epoch": 9.291338582677165,
      "grad_norm": 0.2637965977191925,
      "learning_rate": 1.3818839220735792e-06,
      "loss": 0.0207,
      "step": 2360
    },
    {
      "epoch": 9.330708661417322,
      "grad_norm": 0.212178036570549,
      "learning_rate": 1.2340216047418695e-06,
      "loss": 0.0207,
      "step": 2370
    },
    {
      "epoch": 9.37007874015748,
      "grad_norm": 0.38010916113853455,
      "learning_rate": 1.094425297077295e-06,
      "loss": 0.0239,
      "step": 2380
    },
    {
      "epoch": 9.409448818897637,
      "grad_norm": 0.5460922718048096,
      "learning_rate": 9.631186611587405e-07,
      "loss": 0.0216,
      "step": 2390
    },
    {
      "epoch": 9.448818897637794,
      "grad_norm": 0.3007522523403168,
      "learning_rate": 8.401239539358008e-07,
      "loss": 0.0301,
      "step": 2400
    },
    {
      "epoch": 9.488188976377952,
      "grad_norm": 0.279300719499588,
      "learning_rate": 7.254620234560583e-07,
      "loss": 0.0313,
      "step": 2410
    },
    {
      "epoch": 9.527559055118111,
      "grad_norm": 0.6319423913955688,
      "learning_rate": 6.191523053313386e-07,
      "loss": 0.0298,
      "step": 2420
    },
    {
      "epoch": 9.566929133858268,
      "grad_norm": 0.5479516983032227,
      "learning_rate": 5.212128194432509e-07,
      "loss": 0.0222,
      "step": 2430
    },
    {
      "epoch": 9.606299212598426,
      "grad_norm": 0.44871729612350464,
      "learning_rate": 4.3166016688879205e-07,
      "loss": 0.021,
      "step": 2440
    },
    {
      "epoch": 9.645669291338583,
      "grad_norm": 0.5158320665359497,
      "learning_rate": 3.505095271663705e-07,
      "loss": 0.0174,
      "step": 2450
    },
    {
      "epoch": 9.68503937007874,
      "grad_norm": 0.3390265107154846,
      "learning_rate": 2.7777465560285265e-07,
      "loss": 0.0224,
      "step": 2460
    },
    {
      "epoch": 9.724409448818898,
      "grad_norm": 0.39901813864707947,
      "learning_rate": 2.1346788102196148e-07,
      "loss": 0.0181,
      "step": 2470
    },
    {
      "epoch": 9.763779527559056,
      "grad_norm": 0.4687666893005371,
      "learning_rate": 1.5760010365450938e-07,
      "loss": 0.0213,
      "step": 2480
    },
    {
      "epoch": 9.803149606299213,
      "grad_norm": 0.5264361500740051,
      "learning_rate": 1.1018079329076503e-07,
      "loss": 0.0249,
      "step": 2490
    },
    {
      "epoch": 9.84251968503937,
      "grad_norm": 0.5656106472015381,
      "learning_rate": 7.121798767530385e-08,
      "loss": 0.0217,
      "step": 2500
    },
    {
      "epoch": 9.881889763779528,
      "grad_norm": 0.3701815903186798,
      "learning_rate": 4.071829114455361e-08,
      "loss": 0.0198,
      "step": 2510
    },
    {
      "epoch": 9.921259842519685,
      "grad_norm": 0.2982766628265381,
      "learning_rate": 1.868687350736198e-08,
      "loss": 0.0219,
      "step": 2520
    },
    {
      "epoch": 9.960629921259843,
      "grad_norm": 0.3252989947795868,
      "learning_rate": 5.1274691686697965e-09,
      "loss": 0.021,
      "step": 2530
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.5611906051635742,
      "learning_rate": 4.23764965562512e-11,
      "loss": 0.0357,
      "step": 2540
    },
    {
      "epoch": 10.0,
      "step": 2540,
      "total_flos": 0.0,
      "train_loss": 0.06096198053106548,
      "train_runtime": 2181.3746,
      "train_samples_per_second": 31.434,
      "train_steps_per_second": 1.164
    }
  ],
  "logging_steps": 10,
  "max_steps": 2540,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 27,
  "trial_name": null,
  "trial_params": null
}